#define HWY_NEON_BUILD_TPL_1
#define HWY_NEON_BUILD_TPL_2
#define HWY_NEON_BUILD_TPL_3

#define HWY_NEON_BUILD_RET_1(type, size) Vec128<type##_t, size>
#define HWY_NEON_BUILD_RET_2(type, size) Vec128<type##_t, size>
#define HWY_NEON_BUILD_RET_3(type, size) Vec128<type##_t, size>

#define HWY_NEON_BUILD_PARAM_1(type, size) const Vec128<type##_t, size> a
#define HWY_NEON_BUILD_PARAM_2(type, size) \
  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
#define HWY_NEON_BUILD_PARAM_3(type, size)                        \
  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b, \
      const Vec128<type##_t, size> c

#define HWY_NEON_BUILD_ARG_1 a.raw
#define HWY_NEON_BUILD_ARG_2 a.raw, b.raw
#define HWY_NEON_BUILD_ARG_3 a.raw, b.raw, c.raw

#define HWY_NEON_EVAL(func, ...) func(__VA_ARGS__)
#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args)  \
  HWY_CONCAT(HWY_NEON_BUILD_TPL_, args)                                       \
  HWY_API HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)                   \
      name(HWY_CONCAT(HWY_NEON_BUILD_PARAM_, args)(type, size)) {             \
    return HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)(                 \
        HWY_NEON_EVAL(prefix##infix##suffix, HWY_NEON_BUILD_ARG_##args));     \
  }
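// Illustrative expansion (a sketch, not itself part of the generated code):
// HWY_NEON_DEF_FUNCTION(uint8, 16, Add, vaddq, _, u8, 2) expands to roughly
//   HWY_API Vec128<uint8_t, 16> Add(const Vec128<uint8_t, 16> a,
//                                   const Vec128<uint8_t, 16> b) {
//     return Vec128<uint8_t, 16>(vaddq_u8(a.raw, b.raw));
//   }
// The TPL/RET/PARAM/ARG helpers selected by `args` (1, 2 or 3) supply the
// template head, return type, parameter list and intrinsic arguments, and
// prefix##infix##suffix forms the NEON intrinsic name.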
#define HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args)       \
  HWY_NEON_DEF_FUNCTION(uint8, 16, name, prefix##q, infix, u8, args)  \
  HWY_NEON_DEF_FUNCTION(uint8, 8, name, prefix, infix, u8, args)      \
  HWY_NEON_DEF_FUNCTION(uint8, 4, name, prefix, infix, u8, args)      \
  HWY_NEON_DEF_FUNCTION(uint8, 2, name, prefix, infix, u8, args)      \
  HWY_NEON_DEF_FUNCTION(uint8, 1, name, prefix, infix, u8, args)

#define HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args)       \
  HWY_NEON_DEF_FUNCTION(int8, 16, name, prefix##q, infix, s8, args)  \
  HWY_NEON_DEF_FUNCTION(int8, 8, name, prefix, infix, s8, args)      \
  HWY_NEON_DEF_FUNCTION(int8, 4, name, prefix, infix, s8, args)      \
  HWY_NEON_DEF_FUNCTION(int8, 2, name, prefix, infix, s8, args)      \
  HWY_NEON_DEF_FUNCTION(int8, 1, name, prefix, infix, s8, args)
#define HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(uint16, 8, name, prefix##q, infix, u16, args) \
  HWY_NEON_DEF_FUNCTION(uint16, 4, name, prefix, infix, u16, args)    \
  HWY_NEON_DEF_FUNCTION(uint16, 2, name, prefix, infix, u16, args)    \
  HWY_NEON_DEF_FUNCTION(uint16, 1, name, prefix, infix, u16, args)

#define HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(int16, 8, name, prefix##q, infix, s16, args) \
  HWY_NEON_DEF_FUNCTION(int16, 4, name, prefix, infix, s16, args)    \
  HWY_NEON_DEF_FUNCTION(int16, 2, name, prefix, infix, s16, args)    \
  HWY_NEON_DEF_FUNCTION(int16, 1, name, prefix, infix, s16, args)

#define HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(uint32, 4, name, prefix##q, infix, u32, args) \
  HWY_NEON_DEF_FUNCTION(uint32, 2, name, prefix, infix, u32, args)    \
  HWY_NEON_DEF_FUNCTION(uint32, 1, name, prefix, infix, u32, args)

#define HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(int32, 4, name, prefix##q, infix, s32, args) \
  HWY_NEON_DEF_FUNCTION(int32, 2, name, prefix, infix, s32, args)    \
  HWY_NEON_DEF_FUNCTION(int32, 1, name, prefix, infix, s32, args)

#define HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(uint64, 2, name, prefix##q, infix, u64, args) \
  HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)

#define HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args) \
  HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args)

#define HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(float32, 4, name, prefix##q, infix, f32, args) \
  HWY_NEON_DEF_FUNCTION(float32, 2, name, prefix, infix, f32, args)    \
  HWY_NEON_DEF_FUNCTION(float32, 1, name, prefix, infix, f32, args)

#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(float64, 2, name, prefix##q, infix, f64, args) \
  HWY_NEON_DEF_FUNCTION(float64, 1, name, prefix, infix, f64, args)
#if HWY_ARCH_ARM_A64
#define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)         \
  HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)
#else
#define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)
#endif
#define HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args)             \
  HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args)            \
  HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args)             \
  HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args)            \
  HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)  \
  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args)  \
  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args)             \
  HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)
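// Usage sketch (assumed, matching the intended call pattern): a single line
// such as
//   HWY_NEON_DEF_FUNCTION_ALL_TYPES(Add, vadd, _, 2)
// would define Add overloads for every lane type and vector size above,
// dispatching to vaddq_u8/vadd_u8/.../vaddq_f32 as appropriate.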
#if HWY_ARCH_ARM_V7
#define vuzp1_s8(x, y) vuzp_s8(x, y).val[0]
#define vuzp1_u8(x, y) vuzp_u8(x, y).val[0]
#define vuzp1_s16(x, y) vuzp_s16(x, y).val[0]
#define vuzp1_u16(x, y) vuzp_u16(x, y).val[0]
#define vuzp1_s32(x, y) vuzp_s32(x, y).val[0]
#define vuzp1_u32(x, y) vuzp_u32(x, y).val[0]
#define vuzp1_f32(x, y) vuzp_f32(x, y).val[0]
#define vuzp1q_s8(x, y) vuzpq_s8(x, y).val[0]
#define vuzp1q_u8(x, y) vuzpq_u8(x, y).val[0]
#define vuzp1q_s16(x, y) vuzpq_s16(x, y).val[0]
#define vuzp1q_u16(x, y) vuzpq_u16(x, y).val[0]
#define vuzp1q_s32(x, y) vuzpq_s32(x, y).val[0]
#define vuzp1q_u32(x, y) vuzpq_u32(x, y).val[0]
#define vuzp1q_f32(x, y) vuzpq_f32(x, y).val[0]
#define vuzp2_s8(x, y) vuzp_s8(x, y).val[1]
#define vuzp2_u8(x, y) vuzp_u8(x, y).val[1]
#define vuzp2_s16(x, y) vuzp_s16(x, y).val[1]
#define vuzp2_u16(x, y) vuzp_u16(x, y).val[1]
#define vuzp2_s32(x, y) vuzp_s32(x, y).val[1]
#define vuzp2_u32(x, y) vuzp_u32(x, y).val[1]
#define vuzp2_f32(x, y) vuzp_f32(x, y).val[1]
#define vuzp2q_s8(x, y) vuzpq_s8(x, y).val[1]
#define vuzp2q_u8(x, y) vuzpq_u8(x, y).val[1]
#define vuzp2q_s16(x, y) vuzpq_s16(x, y).val[1]
#define vuzp2q_u16(x, y) vuzpq_u16(x, y).val[1]
#define vuzp2q_s32(x, y) vuzpq_s32(x, y).val[1]
#define vuzp2q_u32(x, y) vuzpq_u32(x, y).val[1]
#define vuzp2q_f32(x, y) vuzpq_f32(x, y).val[1]
#define vzip1_s8(x, y) vzip_s8(x, y).val[0]
#define vzip1_u8(x, y) vzip_u8(x, y).val[0]
#define vzip1_s16(x, y) vzip_s16(x, y).val[0]
#define vzip1_u16(x, y) vzip_u16(x, y).val[0]
#define vzip1_f32(x, y) vzip_f32(x, y).val[0]
#define vzip1_u32(x, y) vzip_u32(x, y).val[0]
#define vzip1_s32(x, y) vzip_s32(x, y).val[0]
#define vzip1q_s8(x, y) vzipq_s8(x, y).val[0]
#define vzip1q_u8(x, y) vzipq_u8(x, y).val[0]
#define vzip1q_s16(x, y) vzipq_s16(x, y).val[0]
#define vzip1q_u16(x, y) vzipq_u16(x, y).val[0]
#define vzip1q_s32(x, y) vzipq_s32(x, y).val[0]
#define vzip1q_u32(x, y) vzipq_u32(x, y).val[0]
#define vzip1q_f32(x, y) vzipq_f32(x, y).val[0]
#define vzip2_s8(x, y) vzip_s8(x, y).val[1]
#define vzip2_u8(x, y) vzip_u8(x, y).val[1]
#define vzip2_s16(x, y) vzip_s16(x, y).val[1]
#define vzip2_u16(x, y) vzip_u16(x, y).val[1]
#define vzip2_s32(x, y) vzip_s32(x, y).val[1]
#define vzip2_u32(x, y) vzip_u32(x, y).val[1]
#define vzip2_f32(x, y) vzip_f32(x, y).val[1]
#define vzip2q_s8(x, y) vzipq_s8(x, y).val[1]
#define vzip2q_u8(x, y) vzipq_u8(x, y).val[1]
#define vzip2q_s16(x, y) vzipq_s16(x, y).val[1]
#define vzip2q_u16(x, y) vzipq_u16(x, y).val[1]
#define vzip2q_s32(x, y) vzipq_s32(x, y).val[1]
#define vzip2q_u32(x, y) vzipq_u32(x, y).val[1]
#define vzip2q_f32(x, y) vzipq_f32(x, y).val[1]
#endif
template <typename T, size_t N>
struct Raw128;

template <>
struct Raw128<double, 2> {
  using type = float64x2_t;
};

template <>
struct Raw128<double, 1> {
  using type = float64x1_t;
};
template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
 public:
  // Compound assignment; requires the corresponding non-member operator.
  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  typename Raw128<T, N>::type raw;
};
template <typename T>
using Vec64 = Vec128<T, 8 / sizeof(T)>;

template <typename T>
using Vec32 = Vec128<T, 4 / sizeof(T)>;

// FF..FF or 0 per lane.
template <typename T, size_t N = 16 / sizeof(T)>
class Mask128 {
 public:
  typename Raw128<T, N>::type raw;
};

template <typename T, size_t N>
#define HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
#define HWY_NEON_BUILD_RET_HWY_CAST_TO_U8(type, size) \
  Vec128<uint8_t, size * sizeof(type##_t)>
#define HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8(type, size) Vec128<type##_t, size> v
#define HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 v.raw

#undef HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
#undef HWY_NEON_BUILD_RET_HWY_CAST_TO_U8
#undef HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8
#undef HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8
template <size_t N, HWY_IF_LE64(int8_t, N)>
template <size_t N, HWY_IF_LE64(uint16_t, N)>
template <size_t N, HWY_IF_LE64(int16_t, N)>
template <size_t N, HWY_IF_LE64(uint32_t, N)>
template <size_t N, HWY_IF_LE64(int32_t, N)>
template <size_t N, HWY_IF_LE64(float, N)>
template <typename T, size_t N, typename FromT>
HWY_API Vec128<T, N> BitCast(Simd<T, N, 0> d,
                             Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
}
#define HWY_NEON_BUILD_TPL_HWY_SET1
#define HWY_NEON_BUILD_RET_HWY_SET1(type, size) Vec128<type##_t, size>
#define HWY_NEON_BUILD_PARAM_HWY_SET1(type, size) \
  Simd<type##_t, size, 0> /* tag */, const type##_t t
#define HWY_NEON_BUILD_ARG_HWY_SET1 t

#undef HWY_NEON_BUILD_TPL_HWY_SET1
#undef HWY_NEON_BUILD_RET_HWY_SET1
#undef HWY_NEON_BUILD_PARAM_HWY_SET1
#undef HWY_NEON_BUILD_ARG_HWY_SET1
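// (Between these definitions and the #undefs, the header presumably invokes
// HWY_NEON_DEF_FUNCTION_ALL_TYPES with the HWY_SET1 pieces, producing
// Set(Simd<T, N, 0>, T) overloads that broadcast a scalar via vdup*_n_*.)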
template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N, typename T2>
HWY_API Vec128<T, N> Iota(const Simd<T, N, 0> d, const T2 first) {
  HWY_ALIGN T lanes[16 / sizeof(T)];
  for (size_t i = 0; i < 16 / sizeof(T); ++i) {
    lanes[i] = static_cast<T>(first + static_cast<T2>(i));
  }
  return Load(d, lanes);
}
HWY_API uint8_t GetLane(const Vec128<uint8_t, 16> v) {
  return vgetq_lane_u8(v.raw, 0);
}
template <size_t N, HWY_IF_LE64(uint8_t, N)>
HWY_API uint8_t GetLane(const Vec128<uint8_t, N> v) {
  return vget_lane_u8(v.raw, 0);
}
HWY_API int8_t GetLane(const Vec128<int8_t, 16> v) {
  return vgetq_lane_s8(v.raw, 0);
}
template <size_t N, HWY_IF_LE64(int8_t, N)>
HWY_API int8_t GetLane(const Vec128<int8_t, N> v) {
  return vget_lane_s8(v.raw, 0);
}
HWY_API uint16_t GetLane(const Vec128<uint16_t, 8> v) {
  return vgetq_lane_u16(v.raw, 0);
}
template <size_t N, HWY_IF_LE64(uint16_t, N)>
HWY_API uint16_t GetLane(const Vec128<uint16_t, N> v) {
  return vget_lane_u16(v.raw, 0);
}
HWY_API int16_t GetLane(const Vec128<int16_t, 8> v) {
  return vgetq_lane_s16(v.raw, 0);
}
template <size_t N, HWY_IF_LE64(int16_t, N)>
HWY_API int16_t GetLane(const Vec128<int16_t, N> v) {
  return vget_lane_s16(v.raw, 0);
}
HWY_API uint32_t GetLane(const Vec128<uint32_t, 4> v) {
  return vgetq_lane_u32(v.raw, 0);
}
template <size_t N, HWY_IF_LE64(uint32_t, N)>
HWY_API uint32_t GetLane(const Vec128<uint32_t, N> v) {
  return vget_lane_u32(v.raw, 0);
}
HWY_API int32_t GetLane(const Vec128<int32_t, 4> v) {
  return vgetq_lane_s32(v.raw, 0);
}
template <size_t N, HWY_IF_LE64(int32_t, N)>
HWY_API int32_t GetLane(const Vec128<int32_t, N> v) {
  return vget_lane_s32(v.raw, 0);
}
HWY_API uint64_t GetLane(const Vec128<uint64_t, 2> v) {
  return vgetq_lane_u64(v.raw, 0);
}
HWY_API uint64_t GetLane(const Vec64<uint64_t> v) {
  return vget_lane_u64(v.raw, 0);
}
HWY_API int64_t GetLane(const Vec128<int64_t, 2> v) {
  return vgetq_lane_s64(v.raw, 0);
}
HWY_API int64_t GetLane(const Vec64<int64_t> v) {
  return vget_lane_s64(v.raw, 0);
}
HWY_API float GetLane(const Vec128<float, 4> v) {
  return vgetq_lane_f32(v.raw, 0);
}
#if HWY_ARCH_ARM_A64
HWY_API double GetLane(const Vec128<double, 2> v) {
  return vgetq_lane_f64(v.raw, 0);
}
HWY_API double GetLane(const Vec64<double> v) {
  return vget_lane_f64(v.raw, 0);
}
#endif
#pragma push_macro("HWY_NEON_DEF_FUNCTION")
#undef HWY_NEON_DEF_FUNCTION
#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args)   \
  template <int kBits>                                                         \
  HWY_API Vec128<type##_t, size> name(const Vec128<type##_t, size> v) {        \
    return kBits == 0 ? v                                                      \
                      : Vec128<type##_t, size>(HWY_NEON_EVAL(                  \
                            prefix##infix##suffix, v.raw, HWY_MAX(1, kBits))); \
  }

#pragma pop_macro("HWY_NEON_DEF_FUNCTION")
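// NEON's immediate-shift intrinsics require a compile-time count of at least
// 1, hence the kBits == 0 early-out and the HWY_MAX(1, kBits) dummy count
// that keeps the (then unreached) intrinsic call well-formed.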
template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> RotateRight(const Vec128<uint32_t, N> v) {
  static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
  if (kBits == 0) return v;
  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(31, 32 - kBits)>(v));
}

template <int kBits, size_t N>
HWY_API Vec128<uint64_t, N> RotateRight(const Vec128<uint64_t, N> v) {
  static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
  if (kBits == 0) return v;
  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(63, 64 - kBits)>(v));
}
template <size_t N, HWY_IF_LE64(uint8_t, N)>
template <size_t N, HWY_IF_LE64(uint16_t, N)>
template <size_t N, HWY_IF_LE64(uint32_t, N)>
template <size_t N, HWY_IF_LE64(int8_t, N)>
template <size_t N, HWY_IF_LE64(int16_t, N)>
template <size_t N, HWY_IF_LE64(int32_t, N)>
template <size_t N, HWY_IF_LE64(uint8_t, N)>
template <size_t N, HWY_IF_LE64(uint16_t, N)>
template <size_t N, HWY_IF_LE64(uint32_t, N)>
template <size_t N, HWY_IF_LE64(int8_t, N)>
template <size_t N, HWY_IF_LE64(int16_t, N)>
template <size_t N, HWY_IF_LE64(int32_t, N)>
template <typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, int bits) {
  return v << Set(Simd<T, N, 0>(), static_cast<T>(bits));
}
template <typename T, size_t N>
HWY_API Vec128<T, N> ShiftRightSame(const Vec128<T, N> v, int bits) {
  return v >> Set(Simd<T, N, 0>(), static_cast<T>(bits));
}
template <size_t N, HWY_IF_LE64(uint16_t, N)>
template <size_t N, HWY_IF_LE64(uint32_t, N)>
template <size_t N, HWY_IF_LE64(uint16_t, N)>
template <size_t N, HWY_IF_LE64(int32_t, N)>
HWY_API Vec128<int16_t> MulHigh(const Vec128<int16_t> a,
                                const Vec128<int16_t> b) {
  int32x4_t rlo = vmull_s16(vget_low_s16(a.raw), vget_low_s16(b.raw));
#if HWY_ARCH_ARM_A64
  int32x4_t rhi = vmull_high_s16(a.raw, b.raw);
#else
  int32x4_t rhi = vmull_s16(vget_high_s16(a.raw), vget_high_s16(b.raw));
#endif
  return Vec128<int16_t>(
      vuzp2q_s16(vreinterpretq_s16_s32(rlo), vreinterpretq_s16_s32(rhi)));
}

HWY_API Vec128<uint16_t> MulHigh(const Vec128<uint16_t> a,
                                 const Vec128<uint16_t> b) {
  uint32x4_t rlo = vmull_u16(vget_low_u16(a.raw), vget_low_u16(b.raw));
#if HWY_ARCH_ARM_A64
  uint32x4_t rhi = vmull_high_u16(a.raw, b.raw);
#else
  uint32x4_t rhi = vmull_u16(vget_high_u16(a.raw), vget_high_u16(b.raw));
#endif
  return Vec128<uint16_t>(
      vuzp2q_u16(vreinterpretq_u16_u32(rlo), vreinterpretq_u16_u32(rhi)));
}
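// MulHigh rationale: NEON lacks a direct "multiply, keep upper half". The
// code widens via long multiplies (vmull), then vuzp2 collects the odd
// (upper) 16-bit halves of the 32-bit products, i.e. the high half of each
// multiplication.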
template <size_t N, HWY_IF_LE64(int16_t, N)>
HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
                                   const Vec128<int16_t, N> b) {
  int16x8_t hi_lo = vreinterpretq_s16_s32(vmull_s16(a.raw, b.raw));
  return Vec128<int16_t, N>(vget_low_s16(vuzp2q_s16(hi_lo, hi_lo)));
}
template <size_t N, HWY_IF_LE64(uint16_t, N)>
HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
                                    const Vec128<uint16_t, N> b) {
  uint16x8_t hi_lo = vreinterpretq_u16_u32(vmull_u16(a.raw, b.raw));
  return Vec128<uint16_t, N>(vget_low_u16(vuzp2q_u16(hi_lo, hi_lo)));
}
template <size_t N, HWY_IF_LE64(int16_t, N)>

#if HWY_ARCH_ARM_A64
template <size_t N, HWY_IF_LE64(float, N)>
#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
template <size_t N, HWY_IF_LE64(float, N)>
HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul,
                                const Vec128<float, N> x,
                                const Vec128<float, N> add) {
  return Vec128<float, N>(vfma_f32(add.raw, mul.raw, x.raw));
}
HWY_API Vec128<float> MulAdd(const Vec128<float> mul, const Vec128<float> x,
                             const Vec128<float> add) {
  return Vec128<float>(vfmaq_f32(add.raw, mul.raw, x.raw));
}
#else
template <size_t N>
HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul,
                                const Vec128<float, N> x,
                                const Vec128<float, N> add) {
  return mul * x + add;
}
#endif
#if HWY_ARCH_ARM_A64
HWY_API Vec64<double> MulAdd(const Vec64<double> mul, const Vec64<double> x,
                             const Vec64<double> add) {
  return Vec64<double>(vfma_f64(add.raw, mul.raw, x.raw));
}
HWY_API Vec128<double> MulAdd(const Vec128<double> mul, const Vec128<double> x,
                              const Vec128<double> add) {
  return Vec128<double>(vfmaq_f64(add.raw, mul.raw, x.raw));
}
#endif
#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
template <size_t N, HWY_IF_LE64(float, N)>
HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
                                   const Vec128<float, N> x,
                                   const Vec128<float, N> add) {
  return Vec128<float, N>(vfms_f32(add.raw, mul.raw, x.raw));
}
HWY_API Vec128<float> NegMulAdd(const Vec128<float> mul, const Vec128<float> x,
                                const Vec128<float> add) {
  return Vec128<float>(vfmsq_f32(add.raw, mul.raw, x.raw));
}
#else
template <size_t N>
HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
                                   const Vec128<float, N> x,
                                   const Vec128<float, N> add) {
  return add - mul * x;
}
#endif
#if HWY_ARCH_ARM_A64
HWY_API Vec64<double> NegMulAdd(const Vec64<double> mul, const Vec64<double> x,
                                const Vec64<double> add) {
  return Vec64<double>(vfms_f64(add.raw, mul.raw, x.raw));
}
HWY_API Vec128<double> NegMulAdd(const Vec128<double> mul,
                                 const Vec128<double> x,
                                 const Vec128<double> add) {
  return Vec128<double>(vfmsq_f64(add.raw, mul.raw, x.raw));
}
#endif
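// Operand order note: NEON's vfma(a, b, c) computes a + b * c, so the
// accumulator `add` is passed first; vfms(a, b, c) computes a - b * c, which
// is exactly NegMulAdd's add - mul * x.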
#if HWY_ARCH_ARM_A64
template <size_t N>
HWY_API Vec128<double, N> MulSub(const Vec128<double, N> mul,
                                 const Vec128<double, N> x,
                                 const Vec128<double, N> sub) {
  return MulAdd(mul, x, Neg(sub));
}
template <size_t N>
HWY_API Vec128<double, N> NegMulSub(const Vec128<double, N> mul,
                                    const Vec128<double, N> x,
                                    const Vec128<double, N> sub) {
  return Neg(MulAdd(mul, x, sub));
}
#endif
#if HWY_ARCH_ARM_A64

  const auto root = v * recip;
template <typename T>
template <typename T, size_t N, HWY_IF_LE64(T, N)>
  using V8 = decltype(Zero(d8));

template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec128<T, N> AndNot(const Vec128<T, N> not_mask,
                            const Vec128<T, N> mask) {
  return detail::reversed_andnot(mask, not_mask);
}

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> AndNot(const Vec128<T, N> not_mask,
                            const Vec128<T, N> mask) {
  const DFromV<decltype(mask)> d;
  const RebindToUnsigned<decltype(d)> du;
  VFromD<decltype(du)> ret =
      detail::reversed_andnot(BitCast(du, mask), BitCast(du, not_mask));
  return BitCast(d, ret);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
  return Or(o, And(a1, a2));
}

template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>
#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif
template <typename T>
template <typename T, size_t N, HWY_IF_LE64(T, N)>

template <typename T>
  const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);

template <typename T, size_t N, HWY_IF_LE64(T, N)>
  const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);

template <typename T>
  const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);
  return Vec128<T>(vpaddlq_u16(vpaddlq_u8(bytes)));

template <typename T, size_t N, HWY_IF_LE64(T, N)>
  const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);

template <typename T>
  const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);
  return Vec128<T>(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(bytes))));

template <typename T, size_t N, HWY_IF_LE64(T, N)>
  const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);
  return Vec128<T, N>(vpaddl_u32(vpaddl_u16(vpaddl_u8(bytes))));
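// PopulationCount strategy: vcnt_u8 yields per-byte bit counts; for 16/32/64
// bit lanes, chains of pairwise add-long (vpaddl: u8 -> u16 -> u32 -> u64)
// sum the byte counts into each full lane.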
template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>

template <size_t N, HWY_IF_LE64(int8_t, N)>
template <size_t N, HWY_IF_LE64(int16_t, N)>
template <size_t N, HWY_IF_LE64(int32_t, N)>
template <size_t N, HWY_IF_LE64(float, N)>
#if HWY_ARCH_ARM_A64
HWY_API Vec128<double> Abs(const Vec128<double> v) {
  return Vec128<double>(vabsq_f64(v.raw));
}
HWY_API Vec64<double> Abs(const Vec64<double> v) {
  return Vec64<double>(vabs_f64(v.raw));
}
#endif
template <typename T, size_t N>
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");

template <typename T, size_t N>
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");

template <typename T, size_t N, HWY_IF_SIGNED(T)>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename TFrom, typename TTo, size_t N>
  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
#define HWY_NEON_BUILD_TPL_HWY_IF
#define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128<type##_t, size>
#define HWY_NEON_BUILD_PARAM_HWY_IF(type, size)                         \
  const Mask128<type##_t, size> mask, const Vec128<type##_t, size> yes, \
      const Vec128<type##_t, size> no
#define HWY_NEON_BUILD_ARG_HWY_IF mask.raw, yes.raw, no.raw
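// These pieces shape IfThenElse(mask, yes, no); the generated bodies
// presumably map to the NEON vbsl (bit select) intrinsics, which take bits
// from `yes` where the mask bit is set and from `no` elsewhere.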
#undef HWY_NEON_BUILD_TPL_HWY_IF
#undef HWY_NEON_BUILD_RET_HWY_IF
#undef HWY_NEON_BUILD_PARAM_HWY_IF
#undef HWY_NEON_BUILD_ARG_HWY_IF
template <typename T, size_t N>
template <typename T, size_t N>

template <typename T, size_t N>
  static_assert(IsSigned<T>(), "Only works for signed/float");

template <typename T, size_t N>
HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
  const auto zero = Zero(Simd<T, N, 0>());
  return Max(zero, v);
}

template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>
#define HWY_NEON_BUILD_TPL_HWY_COMPARE
#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128<type##_t, size>
#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \
  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw

#if HWY_ARCH_ARM_A64

template <typename T, size_t N>

#if HWY_ARCH_ARM_A64

#undef HWY_NEON_BUILD_TPL_HWY_COMPARE
#undef HWY_NEON_BUILD_RET_HWY_COMPARE
#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE
#undef HWY_NEON_BUILD_ARG_HWY_COMPARE
                                      const Vec128<int64_t, N> b) {
  const Simd<int32_t, N * 2, 0> d32;
  const Simd<int64_t, N, 0> d64;

                                       const Vec128<uint64_t, N> b) {
  const Simd<uint32_t, N * 2, 0> d32;
  const Simd<uint64_t, N, 0> d64;

                                   const Vec128<int64_t> b) {
  const int64x2_t sub = vqsubq_s64(a.raw, b.raw);

                                  const Vec64<int64_t> b) {
  const int64x1_t sub = vqsub_s64(a.raw, b.raw);

                                    const Vec128<uint64_t, N> b) {
  const DFromV<decltype(a)> du;
  const Vec128<uint64_t, N> msb = AndNot(a, b) | AndNot(a ^ b, a - b);
template <typename T, size_t N>
template <typename T, size_t N>

template <typename T, size_t N>

#define HWY_NEON_BUILD_TPL_HWY_TESTBIT
#define HWY_NEON_BUILD_RET_HWY_TESTBIT(type, size) Mask128<type##_t, size>
#define HWY_NEON_BUILD_PARAM_HWY_TESTBIT(type, size) \
  Vec128<type##_t, size> v, Vec128<type##_t, size> bit
#define HWY_NEON_BUILD_ARG_HWY_TESTBIT v.raw, bit.raw

#if HWY_ARCH_ARM_A64
  return (v & bit) == bit;
  return (v & bit) == bit;

#undef HWY_NEON_BUILD_TPL_HWY_TESTBIT
#undef HWY_NEON_BUILD_RET_HWY_TESTBIT
#undef HWY_NEON_BUILD_PARAM_HWY_TESTBIT
#undef HWY_NEON_BUILD_ARG_HWY_TESTBIT
#if HWY_ARCH_ARM_A64

#if HWY_ARCH_ARM_A64

                                 const Vec128<uint64_t, N> b) {
#if HWY_ARCH_ARM_A64
  const DFromV<decltype(a)> du;

#if HWY_ARCH_ARM_A64

#if HWY_ARCH_ARM_A64

                                 const Vec128<uint64_t, N> b) {
#if HWY_ARCH_ARM_A64
  const DFromV<decltype(a)> du;

#if HWY_ARCH_ARM_A64

#if HWY_ARCH_ARM_A64
#if HWY_ARCH_ARM_A64
  return Vec128<double>(vld1q_f64(unaligned));

#if HWY_ARCH_ARM_A64
  return Vec64<double>(vld1_f64(p));

  uint32x2_t a = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
  uint32x2_t a = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
  return Vec32<uint32_t>(vld1_dup_u32(reinterpret_cast<const uint32_t*>(p)));
  int32x2_t a = vld1_dup_s32(reinterpret_cast<const int32_t*>(p));
  int32x2_t a = vld1_dup_s32(reinterpret_cast<const int32_t*>(p));
  return Vec32<int32_t>(vld1_dup_s32(reinterpret_cast<const int32_t*>(p)));

  uint16x4_t a = vld1_dup_u16(reinterpret_cast<const uint16_t*>(p));
      vld1_dup_u16(reinterpret_cast<const uint16_t*>(p)));
  int16x4_t a = vld1_dup_s16(reinterpret_cast<const int16_t*>(p));

  const auto pu16 = reinterpret_cast<const uint16_t*>(p);
  const auto pu16 = reinterpret_cast<const uint16_t*>(p);
template <typename T, size_t N>
template <typename T, size_t N>

template <typename T, size_t N, HWY_IF_LE128(T, N)>
  vst1q_u8(unaligned, v.raw);
  vst1q_u16(unaligned, v.raw);
  vst1q_u32(unaligned, v.raw);
  vst1q_u64(unaligned, v.raw);
  vst1q_s8(unaligned, v.raw);
  vst1q_s16(unaligned, v.raw);
  vst1q_s32(unaligned, v.raw);
  vst1q_s64(unaligned, v.raw);
  vst1q_f32(unaligned, v.raw);
#if HWY_ARCH_ARM_A64
  vst1q_f64(unaligned, v.raw);
#if HWY_ARCH_ARM_A64

  uint32x2_t a = vreinterpret_u32_u8(v.raw);
  vst1_lane_u32(reinterpret_cast<uint32_t*>(p), a, 0);
  uint32x2_t a = vreinterpret_u32_u16(v.raw);
  vst1_lane_u32(reinterpret_cast<uint32_t*>(p), a, 0);
  vst1_lane_u32(p, v.raw, 0);
  int32x2_t a = vreinterpret_s32_s8(v.raw);
  vst1_lane_s32(reinterpret_cast<int32_t*>(p), a, 0);
  int32x2_t a = vreinterpret_s32_s16(v.raw);
  vst1_lane_s32(reinterpret_cast<int32_t*>(p), a, 0);
  vst1_lane_s32(p, v.raw, 0);
  vst1_lane_f32(p, v.raw, 0);

  uint16x4_t a = vreinterpret_u16_u8(v.raw);
  vst1_lane_u16(reinterpret_cast<uint16_t*>(p), a, 0);
  vst1_lane_u16(p, v.raw, 0);
  int16x4_t a = vreinterpret_s16_s8(v.raw);
  vst1_lane_s16(reinterpret_cast<int16_t*>(p), a, 0);
  vst1_lane_s16(p, v.raw, 0);

  vst1_lane_u8(p, v.raw, 0);
  vst1_lane_s8(p, v.raw, 0);

  const auto pu16 = reinterpret_cast<uint16_t*>(p);
  const auto pu16 = reinterpret_cast<uint16_t*>(p);
template <typename T, size_t N>
template <typename T, size_t N>
  const auto blended =

template <typename T, size_t N>
  uint16x8_t a = vmovl_u8(v.raw);
  uint16x8_t a = vmovl_u8(v.raw);

template <size_t N, HWY_IF_LE64(uint16_t, N)>
template <size_t N, HWY_IF_LE64(uint32_t, N)>
  uint16x8_t a = vmovl_u8(v.raw);

template <size_t N, HWY_IF_LE64(uint64_t, N)>
template <size_t N, HWY_IF_LE64(int16_t, N)>
template <size_t N, HWY_IF_LE64(int32_t, N)>
  uint16x8_t a = vmovl_u8(v.raw);
  uint32x4_t b = vmovl_u16(vget_low_u16(a));

template <size_t N, HWY_IF_LE64(int32_t, N)>
  uint32x4_t a = vmovl_u16(v.raw);

  int16x8_t a = vmovl_s8(v.raw);
  int16x8_t a = vmovl_s8(v.raw);
  int32x4_t b = vmovl_s16(vget_low_s16(a));
                                const Vec128<float16_t, 4> v) {
  const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw));
  return Vec128<float>(f32);
}

                                   const Vec128<float16_t, N> v) {
  const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw));
  return Vec128<float, N>(vget_low_f32(f32));
}

  const auto sign = ShiftRight<15>(bits16);
  const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
  const auto mantissa = bits16 & Set(du32, 0x3FF);
  const auto subnormal =
      BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
                        Set(df32, 1.0f / 16384 / 1024));

  const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
  const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
  const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
  const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
  return BitCast(df32, ShiftLeft<31>(sign) | bits32);
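// Scalar-fallback rationale: an IEEE binary16 value has 1 sign bit, 5
// exponent bits (bias 15) and 10 mantissa bits. Normal inputs are promoted by
// re-biasing the exponent to 127 and shifting the mantissa into the 23-bit
// float32 field. A zero biased exponent marks a subnormal whose value is
// mantissa * 2^-24, i.e. mantissa * (1.0f / 16384 / 1024).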
#if HWY_ARCH_ARM_A64

                                const Vec64<float> v) {
  return Vec128<double>(vcvt_f64_f32(v.raw));
}

                               const Vec32<float> v) {
  return Vec64<double>(vget_low_f64(vcvt_f64_f32(v.raw)));
}

                                const Vec64<int32_t> v) {
  const int64x2_t i64 = vmovl_s32(v.raw);
  return Vec128<double>(vcvtq_f64_s64(i64));
}

                               const Vec32<int32_t> v) {
  const int64x1_t i64 = vget_low_s64(vmovl_s32(v.raw));
  return Vec64<double>(vcvt_f64_s64(i64));
}
  const uint16x4_t a = vqmovun_s32(v.raw);
  const int16x4_t a = vqmovn_s32(v.raw);

template <size_t N, HWY_IF_LE64(int32_t, N)>
template <size_t N, HWY_IF_LE64(int32_t, N)>
template <size_t N, HWY_IF_LE64(int32_t, N)>
  const uint16x4_t a = vqmovun_s32(vcombine_s32(v.raw, v.raw));
template <size_t N, HWY_IF_LE64(int16_t, N)>
template <size_t N, HWY_IF_LE64(int32_t, N)>
  const int16x4_t a = vqmovn_s32(vcombine_s32(v.raw, v.raw));
template <size_t N, HWY_IF_LE64(int16_t, N)>

                                 const Vec128<float> v) {
  return Vec128<float16_t, 4>{vreinterpret_u16_f16(vcvt_f16_f32(v.raw))};
}
                                    const Vec128<float, N> v) {
  const float16x4_t f16 = vcvt_f16_f32(vcombine_f32(v.raw, v.raw));
  return Vec128<float16_t, N>(vreinterpret_u16_f16(f16));
}
  const Rebind<uint32_t, decltype(du16)> du;

  const auto bits32 = BitCast(du, v);
  const auto sign = ShiftRight<31>(bits32);
  const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
  const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);

  const auto k15 = Set(di, 15);
  const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
  const auto is_tiny = exp < Set(di, -24);

  const auto is_subnormal = exp < Set(di, -14);
  const auto biased_exp16 =
  const auto sub_exp = BitCast(du, Set(di, -14) - exp);
  const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
                     (mantissa32 >> (Set(du, 13) + sub_exp));
      ShiftRight<13>(mantissa32));

  const auto sign16 = ShiftLeft<15>(sign);
  const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
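// The float32 -> float16 demotion clamps the unbiased exponent to at most 15
// (the binary16 maximum), flushes inputs below 2^-24 to zero (is_tiny), and
// for subnormal outputs (exponent < -14) rebuilds the mantissa from the
// implicit leading 1 plus appropriately shifted mantissa bits (sub_m).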
  const Rebind<int32_t, decltype(dbf16)> di32;
  const Rebind<uint32_t, decltype(dbf16)> du32;
  const Rebind<uint16_t, decltype(dbf16)> du16;
  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
#if HWY_ARCH_ARM_A64
  return Vec64<float>(vcvt_f32_f64(v.raw));
  return Vec32<float>(vcvt_f32_f64(vcombine_f64(v.raw, v.raw)));

                               const Vec128<double> v) {
  const int64x2_t i64 = vcvtq_s64_f64(v.raw);
  return Vec64<int32_t>(vqmovn_s64(i64));
}

                               const Vec64<double> v) {
  const int64x1_t i64 = vcvt_s64_f64(v.raw);
  const int64x2_t i64x2 = vcombine_s64(i64, i64);
  return Vec32<int32_t>(vqmovn_s64(i64x2));
}
  const uint8x16_t w = vuzp1q_u8(org_v, org_v);

template <size_t N, HWY_IF_LE64(uint32_t, N)>
  const uint8x8_t w = vuzp1_u8(org_v, org_v);

  uint16x8_t c = vcombine_u16(a.raw, b.raw);
  int16x8_t c = vcombine_s16(a.raw, b.raw);
template <size_t N, HWY_IF_LE64(int32_t, N)>
template <size_t N, HWY_IF_LE64(float, N)>

#if HWY_ARCH_ARM_A64
                                 const Vec128<int64_t> v) {
  return Vec128<double>(vcvtq_f64_s64(v.raw));
}
                                const Vec64<int64_t> v) {
  return Vec64<double>(vcvt_f64_s64(v.raw));
}

                                  const Vec128<double> v) {
  return Vec128<int64_t>(vcvtq_s64_f64(v.raw));
}
                                 const Vec64<double> v) {
  return Vec64<int64_t>(vcvt_s64_f64(v.raw));
}
#if HWY_ARCH_ARM_A64

  const auto int_f = ConvertTo(df, integer);

  const auto added = large + v;
  const auto rounded = added - large;
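// Classic round-to-nearest trick for targets without vcvtn: adding a
// sufficiently large constant (presumably 2^23 for float) forces the FPU to
// round away all fractional bits, and subtracting it again recovers the
// rounded value.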
  const auto int_f = ConvertTo(df, integer);

  const auto int_f = ConvertTo(df, integer);
#if HWY_ARCH_ARM_A64
  return Vec128<int32_t>(vcvtnq_s32_f32(v.raw));
template <size_t N, HWY_IF_LE64(float, N)>
  return Vec128<int32_t, N>(vcvtn_s32_f32(v.raw));
template <typename T, size_t N, HWY_IF_LE64(uint8_t, N)>

#if HWY_ARCH_ARM_A64
  return Vec64<double>(vget_low_f64(v.raw));

template <typename T, size_t N>
template <int kBytes, typename T, class V128 = Vec128<T>>
HWY_API V128 CombineShiftRightBytes(Full128<T> d, V128 hi, V128 lo) {
  static_assert(0 < kBytes && kBytes < 16, "kBytes must be in [1, 15]");
  const Repartition<uint8_t, decltype(d)> d8;
  uint8x16_t v8 = vextq_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes);
  return BitCast(d, Vec128<uint8_t>(v8));
}

template <int kBytes, typename T>
HWY_API Vec64<T> CombineShiftRightBytes(Full64<T> d, Vec64<T> hi, Vec64<T> lo) {
  static_assert(0 < kBytes && kBytes < 8, "kBytes must be in [1, 7]");
  const Repartition<uint8_t, decltype(d)> d8;
  uint8x8_t v8 = vext_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes);
  return BitCast(d, Vec64<uint8_t>(v8));
}

template <int kBytes>

template <class T, size_t N, HWY_IF_LE64(T, N)>
  const auto zero64 = Zero(d64);
  const decltype(zero64) v64(v.raw);
      CombineShiftRightBytes<8 - kBytes>(d64, v64, zero64).raw);
template <class T, size_t N>
template <class T, size_t N>

template <int kBytes>
template <class T, size_t N>
  if (N * sizeof(T) < 8) {
    constexpr size_t kReg = N * sizeof(T) == 16 ? 16 : 8;
    const Simd<T, kReg / sizeof(T), 0> dreg;

  return CombineShiftRightBytes<kBytes>(d, Zero(d), v);
template <class T, size_t N>
template <class T, size_t N>
template <int kBytes, typename T, size_t N>
template <int kBytes, typename T, size_t N>
template <int kLanes, typename T, size_t N>
template <int kLanes, typename T, size_t N>
template <int kBytes, typename T, size_t N>
template <int kLanes, typename T, size_t N>
template <int kBytes, typename T, size_t N, HWY_IF_LE32(T, N)>
  constexpr size_t kSize = N * sizeof(T);
  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
  using V64 = VFromD<decltype(d_full8)>;
  const V64 hi64(BitCast(d8, hi).raw);
#if HWY_ARCH_ARM_A64
                              const Vec128<double> v) {
  return Vec64<double>(vget_high_f64(v.raw));
}

template <typename T, size_t N, HWY_IF_LE64(T, N)>
  return Vec128<T, (N + 1) / 2>(upper.raw);
#if HWY_ARCH_ARM_A64
// Unsigned
template <int kLane>
HWY_API Vec128<uint16_t> Broadcast(const Vec128<uint16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128<uint16_t>(vdupq_laneq_u16(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_LE64(uint16_t, N)>
HWY_API Vec128<uint16_t, N> Broadcast(const Vec128<uint16_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<uint16_t, N>(vdup_lane_u16(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<uint32_t> Broadcast(const Vec128<uint32_t> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128<uint32_t>(vdupq_laneq_u32(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_LE64(uint32_t, N)>
HWY_API Vec128<uint32_t, N> Broadcast(const Vec128<uint32_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<uint32_t, N>(vdup_lane_u32(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<uint64_t> Broadcast(const Vec128<uint64_t> v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  return Vec128<uint64_t>(vdupq_laneq_u64(v.raw, kLane));
}

// Signed
template <int kLane>
HWY_API Vec128<int16_t> Broadcast(const Vec128<int16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128<int16_t>(vdupq_laneq_s16(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_LE64(int16_t, N)>
HWY_API Vec128<int16_t, N> Broadcast(const Vec128<int16_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<int16_t, N>(vdup_lane_s16(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<int32_t> Broadcast(const Vec128<int32_t> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128<int32_t>(vdupq_laneq_s32(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_LE64(int32_t, N)>
HWY_API Vec128<int32_t, N> Broadcast(const Vec128<int32_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<int32_t, N>(vdup_lane_s32(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<int64_t> Broadcast(const Vec128<int64_t> v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  return Vec128<int64_t>(vdupq_laneq_s64(v.raw, kLane));
}

// Float
template <int kLane>
HWY_API Vec128<float> Broadcast(const Vec128<float> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128<float>(vdupq_laneq_f32(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_LE64(float, N)>
HWY_API Vec128<float, N> Broadcast(const Vec128<float, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<float, N>(vdup_lane_f32(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<double> Broadcast(const Vec128<double> v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  return Vec128<double>(vdupq_laneq_f64(v.raw, kLane));
}
template <int kLane>
HWY_API Vec64<double> Broadcast(const Vec64<double> v) {
  static_assert(0 <= kLane && kLane < 1, "Invalid lane");
  return v;
}
#else

template <int kLane>
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
template <int kLane, size_t N, HWY_IF_LE64(uint16_t, N)>
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
template <int kLane>
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
template <int kLane, size_t N, HWY_IF_LE64(uint32_t, N)>
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
template <int kLane>
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");

template <int kLane>
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
template <int kLane, size_t N, HWY_IF_LE64(int16_t, N)>
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
template <int kLane>
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
template <int kLane, size_t N, HWY_IF_LE64(int32_t, N)>
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
template <int kLane>
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");

template <int kLane>
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
template <int kLane, size_t N, HWY_IF_LE64(float, N)>
  static_assert(0 <= kLane && kLane < N, "Invalid lane");

template <int kLane>
  static_assert(0 <= kLane && kLane < 1, "Invalid lane");
template <int kLane>
  static_assert(0 <= kLane && kLane < 1, "Invalid lane");

#endif
template <typename T, size_t N>

template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  const Rebind<TI, decltype(d)> di;

  using V8 = VFromD<decltype(d8)>;

  static_assert(sizeof(T) == 4 || sizeof(T) == 8, "");
  if (sizeof(T) == 4) {
    alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
        0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
    const V8 lane_indices =
    const V8 byte_indices =
    alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
                                                      0, 1, 2, 3, 0, 1, 2, 3};
    const V8 sum = Add(byte_indices, Load(d8, kByteOffsets));
  } else {
    alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
        0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
    const V8 lane_indices =
    const V8 byte_indices =
    alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 4, 5, 6, 7,
                                                      0, 1, 2, 3, 4, 5, 6, 7};
    const V8 sum = Add(byte_indices, Load(d8, kByteOffsets));
  }
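// Byte-index construction: each lane index is broadcast to all of its bytes
// (kBroadcastLaneBytes), presumably scaled by the lane size into a byte
// offset, and kByteOffsets then adds 0..3 (or 0..7) so that every byte of
// lane i addresses the corresponding byte of source lane indices[i].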
template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
  const Rebind<TI, decltype(d)> di;

template <typename T, size_t N>

template <typename T>

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_LE64(T, N)>
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4), HWY_IF_LE64(T, N)>
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_LE64(T, N)>
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>

template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
#if HWY_ARCH_ARM_A64
HWY_API Vec128<uint64_t> InterleaveLower(const Vec128<uint64_t> a,
                                         const Vec128<uint64_t> b) {
  return Vec128<uint64_t>(vzip1q_u64(a.raw, b.raw));
}
HWY_API Vec128<int64_t> InterleaveLower(const Vec128<int64_t> a,
                                        const Vec128<int64_t> b) {
  return Vec128<int64_t>(vzip1q_s64(a.raw, b.raw));
}
HWY_API Vec128<double> InterleaveLower(const Vec128<double> a,
                                       const Vec128<double> b) {
  return Vec128<double>(vzip1q_f64(a.raw, b.raw));
}
template <size_t N, HWY_IF_LE64(float, N)>

template <typename T, size_t N, HWY_IF_LE32(T, N)>

template <typename T, size_t N, class V = Vec128<T, N>>
#if HWY_ARCH_ARM_A64
HWY_API Vec128<uint64_t> InterleaveUpper(const Vec128<uint64_t> a,
                                         const Vec128<uint64_t> b) {
  return Vec128<uint64_t>(vzip2q_u64(a.raw, b.raw));
}
HWY_API Vec128<int64_t> InterleaveUpper(const Vec128<int64_t> a,
                                        const Vec128<int64_t> b) {
  return Vec128<int64_t>(vzip2q_s64(a.raw, b.raw));
}
HWY_API Vec128<double> InterleaveUpper(const Vec128<double> a,
                                       const Vec128<double> b) {
  return Vec128<double>(vzip2q_f64(a.raw, b.raw));
}
template <typename T, size_t N, HWY_IF_GE64(T, N), class V = Vec128<T, N>>

template <typename T, size_t N, HWY_IF_LE32(T, N), class V = Vec128<T, N>>
  const Half<decltype(d)> d2;

template <class V, class DW = RepartitionToWide<DFromV<V>>>
template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
#if HWY_ARCH_ARM_A64
HWY_API Vec128<double> Combine(Full128<double> /* tag */, Vec64<double> hi,
                               Vec64<double> lo) {
  return Vec128<double>(vcombine_f64(lo.raw, hi.raw));
}
#endif

template <typename T, size_t N, HWY_IF_LE64(T, N)>

template <typename T, size_t N>

template <typename T, size_t N, HWY_IF_GE64(T, N)>
#if HWY_ARCH_ARM_A64

#define HWY_NEON_BUILD_TPL_HWY_TRN
#define HWY_NEON_BUILD_RET_HWY_TRN(type, size) type##x##size##x2_t
#define HWY_NEON_BUILD_PARAM_HWY_TRN(TYPE, size) \
  Raw128<TYPE##_t, size>::type a, Raw128<TYPE##_t, size>::type b
#define HWY_NEON_BUILD_ARG_HWY_TRN a, b
template <typename T, size_t N, HWY_IF_LE32(T, N)>
#if HWY_ARCH_ARM_A64
  using VU = VFromD<decltype(du)>;
      d, VU(detail::InterleaveEvenOdd(BitCast(du, lo).raw, BitCast(du, hi).raw)

template <typename T, size_t N, HWY_IF_GE64(T, N)>

template <typename T, size_t N, HWY_IF_LE32(T, N)>
#if HWY_ARCH_ARM_A64
  using VU = VFromD<decltype(du)>;
      d, VU(detail::InterleaveEvenOdd(BitCast(du, lo).raw, BitCast(du, hi).raw)

template <typename T, size_t N, HWY_IF_GE64(T, N)>

template <typename T, size_t N, HWY_IF_LE32(T, N)>
  constexpr size_t kSize = N * sizeof(T);
  const Full64<uint8_t> d8x8;
  const Full64<T> d64;
  using V8x8 = VFromD<decltype(d8x8)>;
  const V8x8 hi8x8(BitCast(d8, hi).raw);
  return Vec128<T, N>(BitCast(d64, r).raw);
template <typename T, size_t N>

template <size_t N, HWY_IF_LE64(uint32_t, N)>
template <size_t N, HWY_IF_LE64(int32_t, N)>
template <size_t N, HWY_IF_LE64(float, N)>
template <typename T, HWY_IF_LANE_SIZE(T, 8)>

template <size_t N, HWY_IF_LE64(uint32_t, N)>
template <size_t N, HWY_IF_LE64(int32_t, N)>
template <size_t N, HWY_IF_LE64(float, N)>
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
#if HWY_ARCH_ARM_A64
  return detail::InterleaveEven(v, v);
#else
  return Vec128<T, N>(detail::InterleaveEvenOdd(v.raw, v.raw).val[0]);
#endif
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
#if HWY_ARCH_ARM_A64
  return detail::InterleaveOdd(v, v);
#else
  return Vec128<T, N>(detail::InterleaveEvenOdd(v.raw, v.raw).val[1]);
#endif
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
template <typename T, size_t N>
HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
  alignas(16) constexpr uint8_t kBytes[16] = {
      ((0 / sizeof(T)) & 1) ? 0 : 0xFF,  ((1 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((2 / sizeof(T)) & 1) ? 0 : 0xFF,  ((3 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((4 / sizeof(T)) & 1) ? 0 : 0xFF,  ((5 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((6 / sizeof(T)) & 1) ? 0 : 0xFF,  ((7 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((8 / sizeof(T)) & 1) ? 0 : 0xFF,  ((9 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((10 / sizeof(T)) & 1) ? 0 : 0xFF, ((11 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((12 / sizeof(T)) & 1) ? 0 : 0xFF, ((13 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((14 / sizeof(T)) & 1) ? 0 : 0xFF, ((15 / sizeof(T)) & 1) ? 0 : 0xFF,
  };
template <typename T, size_t N>
template <typename T, size_t N>

template <typename T>
  const Repartition<uint32_t, decltype(dbf16)> du32;
#if defined(__ARM_FEATURE_AES)

#ifdef HWY_NATIVE_AES
#undef HWY_NATIVE_AES
#else
#define HWY_NATIVE_AES
#endif

HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state,
                                 Vec128<uint8_t> round_key) {
  return Vec128<uint8_t>(vaesmcq_u8(vaeseq_u8(state.raw, vdupq_n_u8(0)))) ^
         round_key;
}

HWY_API Vec128<uint8_t> AESLastRound(Vec128<uint8_t> state,
                                     Vec128<uint8_t> round_key) {
  return Vec128<uint8_t>(vaeseq_u8(state.raw, vdupq_n_u8(0))) ^ round_key;
}

HWY_API Vec128<uint64_t> CLMulLower(Vec128<uint64_t> a, Vec128<uint64_t> b) {
  return Vec128<uint64_t>((uint64x2_t)vmull_p64(GetLane(a), GetLane(b)));
}

HWY_API Vec128<uint64_t> CLMulUpper(Vec128<uint64_t> a, Vec128<uint64_t> b) {
  return Vec128<uint64_t>(
      (uint64x2_t)vmull_high_p64((poly64x2_t)a.raw, (poly64x2_t)b.raw));
}
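// Note on the AES helpers: NEON's vaese XORs its key operand into the state
// before SubBytes/ShiftRows, so passing a zero key and XORing `round_key`
// after vaesmc/vaese reproduces the usual one-round (AESENC-style) semantics.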
  const Rebind<uint16_t, decltype(df32)> du16;

      vmull_s32(vget_low_s32(a_packed), vget_low_s32(b_packed)));
      vmull_u32(vget_low_u32(a_packed), vget_low_u32(b_packed)));

  return Vec128<int64_t, (N + 1) / 2>(
      vget_low_s64(vmull_s32(a_packed, b_packed)));
  return Vec128<uint64_t, (N + 1) / 2>(
      vget_low_u64(vmull_u32(a_packed, b_packed)));

  uint64_t lo = Mul128(vgetq_lane_u64(a.raw, 0), vgetq_lane_u64(b.raw, 0), &hi);
  uint64_t lo = Mul128(vgetq_lane_u64(a.raw, 1), vgetq_lane_u64(b.raw, 1), &hi);
template <typename T, typename TI>

#if HWY_ARCH_ARM_A64
  return BitCast(
      d, Vec128<uint8_t>(vqtbl1q_u8(BitCast(d8, bytes).raw,
                                    BitCast(d8, from).raw)));
#else
  uint8x16_t table0 = BitCast(d8, bytes).raw;
  uint8x8x2_t table;
  table.val[0] = vget_low_u8(table0);
  table.val[1] = vget_high_u8(table0);
  uint8x16_t idx = BitCast(d8, from).raw;
  uint8x8_t low = vtbl2_u8(table, vget_low_u8(idx));
  uint8x8_t hi = vtbl2_u8(table, vget_high_u8(idx));
#endif

template <typename T, typename TI, size_t NI, HWY_IF_LE64(TI, NI)>
  const auto idx_full = Combine(d_full, from64, from64);

template <typename T, size_t N, typename TI, HWY_IF_LE64(T, N)>

template <typename T, size_t N, typename TI, size_t NI,
          HWY_IF_LE64(T, N), HWY_IF_LE64(TI, NI)>
  const Repartition<uint8_t, decltype(d_idx)> d_idx8;
  const auto from8 = BitCast(d_idx8, from);
  const VFromD<decltype(d_idx8)> v8(vtbl1_u8(bytes8.raw, from8.raw));
template <class V, class VI>

template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> d,
                           T* HWY_RESTRICT base,
                           const Vec128<Offset, N> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");

  alignas(16) T lanes[N];
  Store(v, d, lanes);

  alignas(16) Offset offset_lanes[N];
  Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);

  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
  for (size_t i = 0; i < N; ++i) {
    CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
  }
}

template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N, 0> d,
                          T* HWY_RESTRICT base, const Vec128<Index, N> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");

  alignas(16) T lanes[N];
  Store(v, d, lanes);

  alignas(16) Index index_lanes[N];
  Store(index, Rebind<Index, decltype(d)>(), index_lanes);

  for (size_t i = 0; i < N; ++i) {
    base[index_lanes[i]] = lanes[i];
  }
}

template <typename T, size_t N, typename Offset>
HWY_API Vec128<T, N> GatherOffset(const Simd<T, N, 0> d,
                                  const T* HWY_RESTRICT base,
                                  const Vec128<Offset, N> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");

  alignas(16) Offset offset_lanes[N];
  Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);

  alignas(16) T lanes[N];
  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
  for (size_t i = 0; i < N; ++i) {
    CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
  }
  return Load(d, lanes);
}

template <typename T, size_t N, typename Index>
HWY_API Vec128<T, N> GatherIndex(const Simd<T, N, 0> d,
                                 const T* HWY_RESTRICT base,
                                 const Vec128<Index, N> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");

  alignas(16) Index index_lanes[N];
  Store(index, Rebind<Index, decltype(d)>(), index_lanes);

  alignas(16) T lanes[N];
  for (size_t i = 0; i < N; ++i) {
    lanes[i] = base[index_lanes[i]];
  }
  return Load(d, lanes);
}
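// NEON offers no scatter/gather instructions, so the four helpers above spill
// the vector and its offsets/indices to aligned stack arrays and loop over
// the lanes with scalar loads and stores.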
template <typename T>
template <typename T>
template <typename T>

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
template <typename T>
template <typename T>

#if HWY_ARCH_ARM_A64
  return Vec128<int32_t>(vdupq_n_s32(vaddvq_s32(v.raw)));
  return Vec128<float>(vdupq_n_f32(vaddvq_f32(v.raw)));
  return Vec128<uint64_t>(vdupq_n_u64(vaddvq_u64(v.raw)));
  return Vec128<int64_t>(vdupq_n_s64(vaddvq_s64(v.raw)));
  return Vec128<double>(vdupq_n_f64(vaddvq_f64(v.raw)));
  uint32x4x2_t v0 = vuzpq_u32(v.raw, v.raw);
  uint32x4_t c0 = vaddq_u32(v0.val[0], v0.val[1]);
  uint32x4x2_t v1 = vuzpq_u32(c0, c0);

  int32x4x2_t v0 = vuzpq_s32(v.raw, v.raw);
  int32x4_t c0 = vaddq_s32(v0.val[0], v0.val[1]);
  int32x4x2_t v1 = vuzpq_s32(c0, c0);

  float32x4x2_t v0 = vuzpq_f32(v.raw, v.raw);
  float32x4_t c0 = vaddq_f32(v0.val[0], v0.val[1]);
  float32x4x2_t v1 = vuzpq_f32(c0, c0);
template <typename T>
  return Min(v20_31_20_31, v31_20_31_20);
template <typename T>
  return Max(v20_31_20_31, v31_20_31_20);

template <typename T>
  return Min(v10, v01);
template <typename T>
  return Max(v10, v01);

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
  const auto odd = ShiftRight<16>(BitCast(d32, v));
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
  const auto odd = ShiftRight<16>(BitCast(d32, v));
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>

template <typename T, size_t N, HWY_IF_LE64(T, N)>
template <typename T>

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
  const auto vmask_bits = Set64(du, mask_bits);

  alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
                                             1, 1, 1, 1, 1, 1, 1, 1};

  alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
                                            1, 2, 4, 8, 16, 32, 64, 128};
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
  alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
  const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
  alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
  const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
  alignas(16) constexpr uint64_t kBit[8] = {1, 2};

template <typename T, size_t N, HWY_IF_LE128(T, N)>
  uint64_t mask_bits = 0;

template <typename T>
  alignas(16) constexpr uint8_t kSliceLanes[16] = {
      1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80,
  };

#if HWY_ARCH_ARM_A64
  const uint8x8_t x2 = vget_low_u8(vpaddq_u8(values.raw, values.raw));
  const uint8x8_t x4 = vpadd_u8(x2, x2);
  const uint8x8_t x8 = vpadd_u8(x4, x4);
  return vget_lane_u64(vreinterpret_u64_u8(x8), 0);
#else
  const uint16x8_t x2 = vpaddlq_u8(values.raw);
  const uint32x4_t x4 = vpaddlq_u16(x2);
  const uint64x2_t x8 = vpaddlq_u32(x4);
  return (vgetq_lane_u64(x8, 1) << 8) | vgetq_lane_u64(x8, 0);
#endif
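// BitsFromMask approach: each mask lane (all-ones or zero) is ANDed with its
// bit value from kSliceLanes; pairwise additions then fold the lanes into one
// integer. The upper eight lanes reuse the values 1..0x80, which is why the
// non-A64 path shifts the second 64-bit half left by 8 before merging.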
template <typename T, size_t N, HWY_IF_LE64(T, N)>
  alignas(8) constexpr uint8_t kSliceLanes[8] = {1, 2, 4, 8,
                                                 0x10, 0x20, 0x40, 0x80};
#if HWY_ARCH_ARM_A64
  return vaddv_u8(values.raw);
#else
  const uint16x4_t x2 = vpaddl_u8(values.raw);
  const uint32x2_t x4 = vpaddl_u16(x2);
  const uint64x1_t x8 = vpaddl_u32(x4);
  return vget_lane_u64(x8, 0);
#endif

template <typename T>
  alignas(16) constexpr uint16_t kSliceLanes[8] = {1, 2, 4, 8,
                                                   0x10, 0x20, 0x40, 0x80};
#if HWY_ARCH_ARM_A64
  return vaddvq_u16(values.raw);
#else
  const uint32x4_t x2 = vpaddlq_u16(values.raw);
  const uint64x2_t x4 = vpaddlq_u32(x2);
  return vgetq_lane_u64(x4, 0) + vgetq_lane_u64(x4, 1);
#endif

template <typename T, size_t N, HWY_IF_LE64(T, N)>
  alignas(8) constexpr uint16_t kSliceLanes[4] = {1, 2, 4, 8};
#if HWY_ARCH_ARM_A64
  return vaddv_u16(values.raw);
#else
  const uint32x2_t x2 = vpaddl_u16(values.raw);
  const uint64x1_t x4 = vpaddl_u32(x2);
  return vget_lane_u64(x4, 0);
#endif

template <typename T>
  alignas(16) constexpr uint32_t kSliceLanes[4] = {1, 2, 4, 8};
#if HWY_ARCH_ARM_A64
  return vaddvq_u32(values.raw);
#else
  const uint64x2_t x2 = vpaddlq_u32(values.raw);
  return vgetq_lane_u64(x2, 0) + vgetq_lane_u64(x2, 1);
#endif

template <typename T, size_t N, HWY_IF_LE64(T, N)>
  alignas(8) constexpr uint32_t kSliceLanes[2] = {1, 2};
#if HWY_ARCH_ARM_A64
  return vaddv_u32(values.raw);
#else
  const uint64x1_t x2 = vpaddl_u32(values.raw);
  return vget_lane_u64(x2, 0);
#endif

template <typename T>
  alignas(16) constexpr uint64_t kSliceLanes[2] = {1, 2};
#if HWY_ARCH_ARM_A64
  return vaddvq_u64(values.raw);
#else
  return vgetq_lane_u64(values.raw, 0) + vgetq_lane_u64(values.raw, 1);
#endif

template <typename T>
  return vget_lane_u64(values.raw, 0);
template <typename T, size_t N>
  return ((N * sizeof(T)) >= 8) ? bits : (bits & ((1ull << N) - 1));

template <typename T, size_t N>

template <typename T>
  const int8x16_t ones =
#if HWY_ARCH_ARM_A64
  return static_cast<size_t>(vaddvq_s8(ones));
#else
  const int16x8_t x2 = vpaddlq_s8(ones);
  const int32x4_t x4 = vpaddlq_s16(x2);
  const int64x2_t x8 = vpaddlq_s32(x4);
  return static_cast<size_t>(vgetq_lane_s64(x8, 0) + vgetq_lane_s64(x8, 1));
#endif

template <typename T>
  const int16x8_t ones =
#if HWY_ARCH_ARM_A64
  return static_cast<size_t>(vaddvq_s16(ones));
#else
  const int32x4_t x2 = vpaddlq_s16(ones);
  const int64x2_t x4 = vpaddlq_s32(x2);
  return static_cast<size_t>(vgetq_lane_s64(x4, 0) + vgetq_lane_s64(x4, 1));
#endif

template <typename T>
  const int32x4_t ones =
#if HWY_ARCH_ARM_A64
  return static_cast<size_t>(vaddvq_s32(ones));
#else
  const int64x2_t x2 = vpaddlq_s32(ones);
  return static_cast<size_t>(vgetq_lane_s64(x2, 0) + vgetq_lane_s64(x2, 1));
#endif

template <typename T>
#if HWY_ARCH_ARM_A64
  const int64x2_t ones =
  return static_cast<size_t>(vaddvq_s64(ones));
#else
  const uint64x2_t ones = vshrq_n_u64(mask_u.raw, 63);
  return static_cast<size_t>(vgetq_lane_u64(ones, 0) + vgetq_lane_u64(ones, 1));
#endif
template <typename T>

template <typename T, size_t N, HWY_IF_LE64(T, N)>

template <typename T, size_t N>

template <typename T, size_t N>
  const size_t kNumBytes = (N + 7) / 8;
  CopyBytes<kNumBytes>(&mask_bits, bits);

template <typename T>
#if HWY_ARCH_ARM_A64
  return (vmaxvq_u32(m32.raw) == 0);
#else
  uint32x2_t a = vqmovn_u64(v64.raw);
  return vget_lane_u64(vreinterpret_u64_u32(a), 0) == 0;
#endif
template <typename T, size_t N, HWY_IF_LE64(T, N)>

template <typename T, size_t N>

template <typename T>
                                    const uint8_t* bytes) {
  return Vec128<uint8_t>(vreinterpretq_u8_u64(
      vld1q_dup_u64(reinterpret_cast<const uint64_t*>(bytes))));
}

template <size_t N, HWY_IF_LE64(uint8_t, N)>
                                       const uint8_t* bytes) {
  return Load(d, bytes);
}

template <typename T, size_t N>
                                          const uint64_t mask_bits) {
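// Table context: 256 entries of 8 byte indices each (two entries per source
// line below). For an 8-lane vector of 16-bit lanes, entry m lists the byte
// index of each lane after the lanes selected by mask m are compacted to the
// front, with the unselected lanes following in their original order - hence
// only even values 0, 2, ..., 14 appear.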
  alignas(16) constexpr uint8_t table[256 * 8] = {
      0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
      2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
      4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14,
      2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
      6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14,
      2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14,
      4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14,
      2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
      8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14,
      2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14,
      4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14,
      2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14,
      6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14,
      2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14,
      4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14,
      2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
      10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14,
      2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14,
      4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14,
      2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14,
      6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14,
      2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14,
      4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14,
      2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14,
      8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14,
      2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14,
      4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14,
      2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14,
      6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14,
      2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14,
      4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14,
      2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
      12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14,
      2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14,
      4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14,
      2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14,
      6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14,
      2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14,
      4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14,
      2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14,
      8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14,
      2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14,
      4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14,
      2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14,
      6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14,
      2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14,
      4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14,
      2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14,
      10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14,
      2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14,
      4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14,
      2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14,
      6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14,
      2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14,
      4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14,
      2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14,
      8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14,
      2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14,
      4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14,
      2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14,
      6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14,
      2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14,
      4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14,
      2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14,
      14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12,
      2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12,
      4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12,
      2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12,
      6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12,
      2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12,
      4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12,
      2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12,
      8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12,
      2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12,
      4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12,
      2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12,
      6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12,
      2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12,
      4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12,
      2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12,
      10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12,
      2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12,
      4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12,
      2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12,
      6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12,
      2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12,
      4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12,
      2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12,
      8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12,
      2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12,
      4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12,
      2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12,
      6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12,
      2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12,
      4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12,
      2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12,
      12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10,
      2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10,
      4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10,
      2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10,
      6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10,
      2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10,
      4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10,
      2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10,
      8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10,
      2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10,
      4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10,
      2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10,
      6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10,
      2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10,
      4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10,
      2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10,
      10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8,
      2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8,
      4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8,
      2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8,
      6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8,
      2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8,
      4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8,
      2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8,
      8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6,
      2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6,
      4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6,
      2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6,
4958 6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4,
4959 2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4,
4960 4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2,
4961 2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
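// NOTE: explanatory sketch, not part of arm_neon-inl.h. Each 8-entry row of
// the table above stores the byte offsets (2 * lane) of the 16-bit lanes
// whose mask bit is set, followed by the offsets of the cleared lanes; the
// doubling is baked in because the NEON table lookup (TBL/VTBL) consumes
// byte indices. A hypothetical scalar generator that reproduces any row for
// verification:
#include <array>
#include <cstddef>
#include <cstdint>

std::array<uint8_t, 8> MakeIdxRowU16(uint64_t mask_bits) {
  std::array<uint8_t, 8> row{};
  size_t pos = 0;
  for (uint8_t lane = 0; lane < 8; ++lane) {  // selected lanes come first
    if ((mask_bits >> lane) & 1) row[pos++] = 2 * lane;
  }
  for (uint8_t lane = 0; lane < 8; ++lane) {  // then the cleared lanes
    if (!((mask_bits >> lane) & 1)) row[pos++] = 2 * lane;
  }
  return row;  // e.g. mask_bits = 0x80 -> {14, 0, 2, 4, 6, 8, 10, 12}
}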
4968 template <typename T, size_t N>
4970 const uint64_t mask_bits) {
4974 alignas(16) constexpr uint8_t packed_array[16 * 16] = {
4975 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4976 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4977 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15,
4978 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4979 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15,
4980 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15,
4981 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15,
4982 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4983 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
4984 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11,
4985 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11,
4986 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
4987 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
4988 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7,
4989 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,
4990 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
4993 return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
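// Same construction as the 16-bit table, but with 4-byte groups: the byte
// offsets 4*lane .. 4*lane+3 of each selected 32-bit lane precede those of
// the cleared lanes. With only 4 lanes there are just 16 mask values, so the
// full 16-byte shuffle rows are affordable and are loaded directly above.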
4996 #if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64
4998 template <typename T, size_t N>
5000 const uint64_t mask_bits) {
5004 alignas(16) constexpr uint8_t packed_array[4 * 16] = {
5005 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5006 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5007 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
5008 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
5010 const Simd<T, N, 0> d;
5012 return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
5019 template <typename T, size_t N>
5022 detail::IdxFromBits<T, N>(hwy::SizeTag<sizeof(T)>(), mask_bits);
5030 template <typename T, size_t N>
5037 template <typename T, size_t N>
5040 uint64_t mask_bits = 0;
5041 constexpr size_t kNumBytes = (N + 7) / 8;
5042 CopyBytes<kNumBytes>(bits, &mask_bits);
5044 mask_bits &= (1ull << N) - 1;
5051 template <typename T, size_t N>
5060 template <typename T, size_t N>
5065 using TU = TFromD<decltype(du)>;
5067 const size_t count = PopCount(mask_bits);
5076 template <typename T, size_t N>
5080 uint64_t mask_bits = 0;
5081 constexpr size_t kNumBytes = (N + 7) / 8;
5082 CopyBytes<kNumBytes>(bits, &mask_bits);
5084 mask_bits &= (1ull << N) - 1;
5099 const uint8x16x3_t triple = {{v0.raw, v1.raw, v2.raw}};
5100 vst3q_u8(unaligned, triple);
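// vst3q_u8 stores the three 16-byte vectors element-interleaved, e.g. planar
// R/G/B planes back into packed RGB. Scalar reference of the resulting
// memory layout (a sketch for clarity, not the code path used):
#include <cstddef>
#include <cstdint>

void StoreInterleaved3Scalar(const uint8_t v0[16], const uint8_t v1[16],
                             const uint8_t v2[16], uint8_t* unaligned) {
  for (size_t i = 0; i < 16; ++i) {
    unaligned[3 * i + 0] = v0[i];  // first stream, e.g. R
    unaligned[3 * i + 1] = v1[i];  // second stream, e.g. G
    unaligned[3 * i + 2] = v2[i];  // third stream, e.g. B
  }
}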
5107 const uint8x8x3_t triple = {{v0.raw, v1.raw, v2.raw}};
5108 vst3_u8(unaligned, triple);
5112 template <size_t N, HWY_IF_LE32(uint8_t, N)>
5118 alignas(16) uint8_t buf[24];
5119 const uint8x8x3_t triple = {{v0.raw, v1.raw, v2.raw}};
5120 vst3_u8(buf, triple);
5121 CopyBytes<N * 3>(buf, unaligned);
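// For partial vectors (N <= 4 lanes here), vst3_u8 would store a full 24
// bytes and overrun the caller's buffer, so the triple is written to the
// aligned stack buffer first and only the N * 3 valid bytes are copied out.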
5133 const uint8x16x4_t quad = {{v0.raw, v1.raw, v2.raw, v3.raw}};
5134 vst4q_u8(unaligned, quad);
5142 const uint8x8x4_t quad = {{v0.raw, v1.raw, v2.raw, v3.raw}};
5143 vst4_u8(unaligned, quad);
5147 template <size_t N, HWY_IF_LE32(uint8_t, N)>
5154 alignas(16) uint8_t buf[32];
5155 const uint8x8x4_t quad = {{v0.raw, v1.raw, v2.raw, v3.raw}};
5157 CopyBytes<N * 4>(buf, unaligned);
5164 template <size_t kLanes, typename T, size_t N>
5171 template <typename T, size_t N, HWY_IF_LE128(T, N)>
5174 static_assert(!IsSigned<T>() && sizeof(T) == 8, "Use u64");
5330 #undef HWY_NEON_BUILD_ARG_1
5331 #undef HWY_NEON_BUILD_ARG_2
5332 #undef HWY_NEON_BUILD_ARG_3
5333 #undef HWY_NEON_BUILD_PARAM_1
5334 #undef HWY_NEON_BUILD_PARAM_2
5335 #undef HWY_NEON_BUILD_PARAM_3
5336 #undef HWY_NEON_BUILD_RET_1
5337 #undef HWY_NEON_BUILD_RET_2
5338 #undef HWY_NEON_BUILD_RET_3
5339 #undef HWY_NEON_BUILD_TPL_1
5340 #undef HWY_NEON_BUILD_TPL_2
5341 #undef HWY_NEON_BUILD_TPL_3
5342 #undef HWY_NEON_DEF_FUNCTION
5343 #undef HWY_NEON_DEF_FUNCTION_ALL_FLOATS
5344 #undef HWY_NEON_DEF_FUNCTION_ALL_TYPES
5345 #undef HWY_NEON_DEF_FUNCTION_INT_8
5346 #undef HWY_NEON_DEF_FUNCTION_INT_16
5347 #undef HWY_NEON_DEF_FUNCTION_INT_32
5348 #undef HWY_NEON_DEF_FUNCTION_INT_8_16_32
5349 #undef HWY_NEON_DEF_FUNCTION_INTS
5350 #undef HWY_NEON_DEF_FUNCTION_INTS_UINTS
5351 #undef HWY_NEON_DEF_FUNCTION_TPL
5352 #undef HWY_NEON_DEF_FUNCTION_UINT_8
5353 #undef HWY_NEON_DEF_FUNCTION_UINT_16
5354 #undef HWY_NEON_DEF_FUNCTION_UINT_32
5355 #undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32
5356 #undef HWY_NEON_DEF_FUNCTION_UINTS
5357 #undef HWY_NEON_EVAL
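// These helper macros are internal to this per-target header; #undef-ing
// them here keeps them from leaking into other targets' implementations that
// are textually included into the same translation unit.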