arm_neon-inl.h
1 // Copyright 2019 Google LLC
2 // SPDX-License-Identifier: Apache-2.0
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 // 128-bit ARM64 NEON vectors and operations.
17 // External include guard in highway.h - see comment there.
18 
19 // ARM NEON intrinsics are documented at:
20 // https://developer.arm.com/architectures/instruction-sets/intrinsics/#f:@navigationhierarchiessimdisa=[Neon]
21 
22 #include <arm_neon.h>
23 #include <stddef.h>
24 #include <stdint.h>
25 
26 #include "hwy/base.h"
27 #include "hwy/ops/shared-inl.h"
28 
30 namespace hwy {
31 namespace HWY_NAMESPACE {
32 
33 template <typename T>
34 using Full128 = Simd<T, 16 / sizeof(T), 0>;
35 
36 template <typename T>
37 using Full64 = Simd<T, 8 / sizeof(T), 0>;
38 
39 template <typename T>
40 using Full32 = Simd<T, 4 / sizeof(T), 0>;
41 
42 namespace detail { // for code folding and Raw128
43 
44 // Macros used to define single and double function calls for multiple types
45 // for full and half vectors. These macros are undefined at the end of the file.
46 
47 // HWY_NEON_BUILD_TPL_* is the template<...> prefix to the function.
48 #define HWY_NEON_BUILD_TPL_1
49 #define HWY_NEON_BUILD_TPL_2
50 #define HWY_NEON_BUILD_TPL_3
51 
52 // HWY_NEON_BUILD_RET_* is return type; type arg is without _t suffix so we can
53 // extend it to int32x4x2_t packs.
54 #define HWY_NEON_BUILD_RET_1(type, size) Vec128<type##_t, size>
55 #define HWY_NEON_BUILD_RET_2(type, size) Vec128<type##_t, size>
56 #define HWY_NEON_BUILD_RET_3(type, size) Vec128<type##_t, size>
57 
58 // HWY_NEON_BUILD_PARAM_* is the list of parameters the function receives.
59 #define HWY_NEON_BUILD_PARAM_1(type, size) const Vec128<type##_t, size> a
60 #define HWY_NEON_BUILD_PARAM_2(type, size) \
61  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
62 #define HWY_NEON_BUILD_PARAM_3(type, size) \
63  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b, \
64  const Vec128<type##_t, size> c
65 
66 // HWY_NEON_BUILD_ARG_* is the list of arguments passed to the underlying
67 // function.
68 #define HWY_NEON_BUILD_ARG_1 a.raw
69 #define HWY_NEON_BUILD_ARG_2 a.raw, b.raw
70 #define HWY_NEON_BUILD_ARG_3 a.raw, b.raw, c.raw
71 
72 // We use HWY_NEON_EVAL(func, ...) to delay the evaluation of func until after
73 // the __VA_ARGS__ have been expanded. This allows "func" to itself be a macro,
74 // as is the case for some of the library "functions" such as vshlq_u8. For
75 // example, HWY_NEON_EVAL(vshlq_u8, MY_PARAMS), where MY_PARAMS is defined as
76 // "a, b" (without the quotes), expands to vshlq_u8(a, b). Directly writing
77 // vshlq_u8(MY_PARAMS) would fail because the vshlq_u8() macro expects two
78 // arguments.
79 #define HWY_NEON_EVAL(func, ...) func(__VA_ARGS__)
80 
81 // Main macro definition that defines a single function for the given type and
82 // size of vector, using the underlying (prefix##infix##suffix) function and
83 // the template, return type, parameters and arguments defined by the "args"
84 // parameters passed here (see HWY_NEON_BUILD_* macros defined before).
85 #define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
86  HWY_CONCAT(HWY_NEON_BUILD_TPL_, args) \
87  HWY_API HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size) \
88  name(HWY_CONCAT(HWY_NEON_BUILD_PARAM_, args)(type, size)) { \
89  return HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)( \
90  HWY_NEON_EVAL(prefix##infix##suffix, HWY_NEON_BUILD_ARG_##args)); \
91  }
92 
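// Illustrative expansion (not part of the original header): a call such as
//   HWY_NEON_DEF_FUNCTION(uint8, 16, Add, vaddq, _, u8, 2)
// (the name "Add" and prefix "vaddq" are chosen here only for illustration)
// expands to roughly
//   HWY_API Vec128<uint8_t, 16> Add(const Vec128<uint8_t, 16> a,
//                                   const Vec128<uint8_t, 16> b) {
//     return Vec128<uint8_t, 16>(vaddq_u8(a.raw, b.raw));
//   }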
93 // The HWY_NEON_DEF_FUNCTION_* macros define all the variants of a function
94 // called "name" using the set of NEON functions starting with the given
95 // "prefix" for all the variants of certain types, as specified next to each
96 // macro. For example, the prefix "vsub" can be used to define operator-
97 // using args=2.
98 
99 // uint8_t
100 #define HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args) \
101  HWY_NEON_DEF_FUNCTION(uint8, 16, name, prefix##q, infix, u8, args) \
102  HWY_NEON_DEF_FUNCTION(uint8, 8, name, prefix, infix, u8, args) \
103  HWY_NEON_DEF_FUNCTION(uint8, 4, name, prefix, infix, u8, args) \
104  HWY_NEON_DEF_FUNCTION(uint8, 2, name, prefix, infix, u8, args) \
105  HWY_NEON_DEF_FUNCTION(uint8, 1, name, prefix, infix, u8, args)
106 
107 // int8_t
108 #define HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args) \
109  HWY_NEON_DEF_FUNCTION(int8, 16, name, prefix##q, infix, s8, args) \
110  HWY_NEON_DEF_FUNCTION(int8, 8, name, prefix, infix, s8, args) \
111  HWY_NEON_DEF_FUNCTION(int8, 4, name, prefix, infix, s8, args) \
112  HWY_NEON_DEF_FUNCTION(int8, 2, name, prefix, infix, s8, args) \
113  HWY_NEON_DEF_FUNCTION(int8, 1, name, prefix, infix, s8, args)
114 
115 // uint16_t
116 #define HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args) \
117  HWY_NEON_DEF_FUNCTION(uint16, 8, name, prefix##q, infix, u16, args) \
118  HWY_NEON_DEF_FUNCTION(uint16, 4, name, prefix, infix, u16, args) \
119  HWY_NEON_DEF_FUNCTION(uint16, 2, name, prefix, infix, u16, args) \
120  HWY_NEON_DEF_FUNCTION(uint16, 1, name, prefix, infix, u16, args)
121 
122 // int16_t
123 #define HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args) \
124  HWY_NEON_DEF_FUNCTION(int16, 8, name, prefix##q, infix, s16, args) \
125  HWY_NEON_DEF_FUNCTION(int16, 4, name, prefix, infix, s16, args) \
126  HWY_NEON_DEF_FUNCTION(int16, 2, name, prefix, infix, s16, args) \
127  HWY_NEON_DEF_FUNCTION(int16, 1, name, prefix, infix, s16, args)
128 
129 // uint32_t
130 #define HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args) \
131  HWY_NEON_DEF_FUNCTION(uint32, 4, name, prefix##q, infix, u32, args) \
132  HWY_NEON_DEF_FUNCTION(uint32, 2, name, prefix, infix, u32, args) \
133  HWY_NEON_DEF_FUNCTION(uint32, 1, name, prefix, infix, u32, args)
134 
135 // int32_t
136 #define HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args) \
137  HWY_NEON_DEF_FUNCTION(int32, 4, name, prefix##q, infix, s32, args) \
138  HWY_NEON_DEF_FUNCTION(int32, 2, name, prefix, infix, s32, args) \
139  HWY_NEON_DEF_FUNCTION(int32, 1, name, prefix, infix, s32, args)
140 
141 // uint64_t
142 #define HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args) \
143  HWY_NEON_DEF_FUNCTION(uint64, 2, name, prefix##q, infix, u64, args) \
144  HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
145 
146 // int64_t
147 #define HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args) \
148  HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args) \
149  HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args)
150 
151 // float
152 #define HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \
153  HWY_NEON_DEF_FUNCTION(float32, 4, name, prefix##q, infix, f32, args) \
154  HWY_NEON_DEF_FUNCTION(float32, 2, name, prefix, infix, f32, args) \
155  HWY_NEON_DEF_FUNCTION(float32, 1, name, prefix, infix, f32, args)
156 
157 // double
158 #define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args) \
159  HWY_NEON_DEF_FUNCTION(float64, 2, name, prefix##q, infix, f64, args) \
160  HWY_NEON_DEF_FUNCTION(float64, 1, name, prefix, infix, f64, args)
161 
162 // float and double
163 #if HWY_ARCH_ARM_A64
164 #define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \
165  HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \
166  HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)
167 #else
168 #define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \
169  HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)
170 #endif
171 
172 // Helper macros to define for more than one type.
173 // uint8_t, uint16_t and uint32_t
174 #define HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
175  HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args) \
176  HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args) \
177  HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)
178 
179 // int8_t, int16_t and int32_t
180 #define HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
181  HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args) \
182  HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args) \
183  HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)
184 
185 // uint8_t, uint16_t, uint32_t and uint64_t
186 #define HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args) \
187  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
188  HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)
189 
190 // int8_t, int16_t, int32_t and int64_t
191 #define HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args) \
192  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
193  HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)
194 
195 // All int*_t and uint*_t up to 64
196 #define HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \
197  HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args) \
198  HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)
199 
200 // All previous types.
201 #define HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args) \
202  HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \
203  HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)
204 
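// For illustration: with these helpers, a single line such as the one used
// below for addition,
//   HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator+, vadd, _, 2)
// defines operator+ for u8/u16/u32/u64, i8/i16/i32/i64 and float (plus double
// on A64), for the full 128-bit vectors as well as all partial sizes,
// dispatching to vaddq_* or vadd_* as appropriate.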
205 // Emulation of some intrinsics on armv7.
206 #if HWY_ARCH_ARM_V7
207 #define vuzp1_s8(x, y) vuzp_s8(x, y).val[0]
208 #define vuzp1_u8(x, y) vuzp_u8(x, y).val[0]
209 #define vuzp1_s16(x, y) vuzp_s16(x, y).val[0]
210 #define vuzp1_u16(x, y) vuzp_u16(x, y).val[0]
211 #define vuzp1_s32(x, y) vuzp_s32(x, y).val[0]
212 #define vuzp1_u32(x, y) vuzp_u32(x, y).val[0]
213 #define vuzp1_f32(x, y) vuzp_f32(x, y).val[0]
214 #define vuzp1q_s8(x, y) vuzpq_s8(x, y).val[0]
215 #define vuzp1q_u8(x, y) vuzpq_u8(x, y).val[0]
216 #define vuzp1q_s16(x, y) vuzpq_s16(x, y).val[0]
217 #define vuzp1q_u16(x, y) vuzpq_u16(x, y).val[0]
218 #define vuzp1q_s32(x, y) vuzpq_s32(x, y).val[0]
219 #define vuzp1q_u32(x, y) vuzpq_u32(x, y).val[0]
220 #define vuzp1q_f32(x, y) vuzpq_f32(x, y).val[0]
221 #define vuzp2_s8(x, y) vuzp_s8(x, y).val[1]
222 #define vuzp2_u8(x, y) vuzp_u8(x, y).val[1]
223 #define vuzp2_s16(x, y) vuzp_s16(x, y).val[1]
224 #define vuzp2_u16(x, y) vuzp_u16(x, y).val[1]
225 #define vuzp2_s32(x, y) vuzp_s32(x, y).val[1]
226 #define vuzp2_u32(x, y) vuzp_u32(x, y).val[1]
227 #define vuzp2_f32(x, y) vuzp_f32(x, y).val[1]
228 #define vuzp2q_s8(x, y) vuzpq_s8(x, y).val[1]
229 #define vuzp2q_u8(x, y) vuzpq_u8(x, y).val[1]
230 #define vuzp2q_s16(x, y) vuzpq_s16(x, y).val[1]
231 #define vuzp2q_u16(x, y) vuzpq_u16(x, y).val[1]
232 #define vuzp2q_s32(x, y) vuzpq_s32(x, y).val[1]
233 #define vuzp2q_u32(x, y) vuzpq_u32(x, y).val[1]
234 #define vuzp2q_f32(x, y) vuzpq_f32(x, y).val[1]
235 #define vzip1_s8(x, y) vzip_s8(x, y).val[0]
236 #define vzip1_u8(x, y) vzip_u8(x, y).val[0]
237 #define vzip1_s16(x, y) vzip_s16(x, y).val[0]
238 #define vzip1_u16(x, y) vzip_u16(x, y).val[0]
239 #define vzip1_f32(x, y) vzip_f32(x, y).val[0]
240 #define vzip1_u32(x, y) vzip_u32(x, y).val[0]
241 #define vzip1_s32(x, y) vzip_s32(x, y).val[0]
242 #define vzip1q_s8(x, y) vzipq_s8(x, y).val[0]
243 #define vzip1q_u8(x, y) vzipq_u8(x, y).val[0]
244 #define vzip1q_s16(x, y) vzipq_s16(x, y).val[0]
245 #define vzip1q_u16(x, y) vzipq_u16(x, y).val[0]
246 #define vzip1q_s32(x, y) vzipq_s32(x, y).val[0]
247 #define vzip1q_u32(x, y) vzipq_u32(x, y).val[0]
248 #define vzip1q_f32(x, y) vzipq_f32(x, y).val[0]
249 #define vzip2_s8(x, y) vzip_s8(x, y).val[1]
250 #define vzip2_u8(x, y) vzip_u8(x, y).val[1]
251 #define vzip2_s16(x, y) vzip_s16(x, y).val[1]
252 #define vzip2_u16(x, y) vzip_u16(x, y).val[1]
253 #define vzip2_s32(x, y) vzip_s32(x, y).val[1]
254 #define vzip2_u32(x, y) vzip_u32(x, y).val[1]
255 #define vzip2_f32(x, y) vzip_f32(x, y).val[1]
256 #define vzip2q_s8(x, y) vzipq_s8(x, y).val[1]
257 #define vzip2q_u8(x, y) vzipq_u8(x, y).val[1]
258 #define vzip2q_s16(x, y) vzipq_s16(x, y).val[1]
259 #define vzip2q_u16(x, y) vzipq_u16(x, y).val[1]
260 #define vzip2q_s32(x, y) vzipq_s32(x, y).val[1]
261 #define vzip2q_u32(x, y) vzipq_u32(x, y).val[1]
262 #define vzip2q_f32(x, y) vzipq_f32(x, y).val[1]
263 #endif
264 
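// Illustrative note: the ARMv7 paired intrinsics return a two-vector struct,
// so for example the AArch64 intrinsic
//   vzip1_u8(a, b)
// is emulated above as
//   vzip_u8(a, b).val[0]  // low half of the interleave
// while vzip2_u8(a, b) maps to .val[1], the high half.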
265 template <typename T, size_t N>
266 struct Raw128;
267 
268 // 128
269 template <>
270 struct Raw128<uint8_t, 16> {
271  using type = uint8x16_t;
272 };
273 
274 template <>
275 struct Raw128<uint16_t, 8> {
276  using type = uint16x8_t;
277 };
278 
279 template <>
280 struct Raw128<uint32_t, 4> {
281  using type = uint32x4_t;
282 };
283 
284 template <>
285 struct Raw128<uint64_t, 2> {
286  using type = uint64x2_t;
287 };
288 
289 template <>
290 struct Raw128<int8_t, 16> {
291  using type = int8x16_t;
292 };
293 
294 template <>
295 struct Raw128<int16_t, 8> {
296  using type = int16x8_t;
297 };
298 
299 template <>
300 struct Raw128<int32_t, 4> {
301  using type = int32x4_t;
302 };
303 
304 template <>
305 struct Raw128<int64_t, 2> {
306  using type = int64x2_t;
307 };
308 
309 template <>
310 struct Raw128<float16_t, 8> {
311  using type = uint16x8_t;
312 };
313 
314 template <>
315 struct Raw128<bfloat16_t, 8> {
316  using type = uint16x8_t;
317 };
318 
319 template <>
320 struct Raw128<float, 4> {
321  using type = float32x4_t;
322 };
323 
324 #if HWY_ARCH_ARM_A64
325 template <>
326 struct Raw128<double, 2> {
327  using type = float64x2_t;
328 };
329 #endif
330 
331 // 64
332 template <>
333 struct Raw128<uint8_t, 8> {
334  using type = uint8x8_t;
335 };
336 
337 template <>
338 struct Raw128<uint16_t, 4> {
339  using type = uint16x4_t;
340 };
341 
342 template <>
343 struct Raw128<uint32_t, 2> {
344  using type = uint32x2_t;
345 };
346 
347 template <>
348 struct Raw128<uint64_t, 1> {
349  using type = uint64x1_t;
350 };
351 
352 template <>
353 struct Raw128<int8_t, 8> {
354  using type = int8x8_t;
355 };
356 
357 template <>
358 struct Raw128<int16_t, 4> {
359  using type = int16x4_t;
360 };
361 
362 template <>
363 struct Raw128<int32_t, 2> {
364  using type = int32x2_t;
365 };
366 
367 template <>
368 struct Raw128<int64_t, 1> {
369  using type = int64x1_t;
370 };
371 
372 template <>
373 struct Raw128<float16_t, 4> {
374  using type = uint16x4_t;
375 };
376 
377 template <>
378 struct Raw128<bfloat16_t, 4> {
379  using type = uint16x4_t;
380 };
381 
382 template <>
383 struct Raw128<float, 2> {
384  using type = float32x2_t;
385 };
386 
387 #if HWY_ARCH_ARM_A64
388 template <>
389 struct Raw128<double, 1> {
390  using type = float64x1_t;
391 };
392 #endif
393 
394 // 32 (same as 64)
395 template <>
396 struct Raw128<uint8_t, 4> {
397  using type = uint8x8_t;
398 };
399 
400 template <>
401 struct Raw128<uint16_t, 2> {
402  using type = uint16x4_t;
403 };
404 
405 template <>
406 struct Raw128<uint32_t, 1> {
407  using type = uint32x2_t;
408 };
409 
410 template <>
411 struct Raw128<int8_t, 4> {
412  using type = int8x8_t;
413 };
414 
415 template <>
416 struct Raw128<int16_t, 2> {
417  using type = int16x4_t;
418 };
419 
420 template <>
421 struct Raw128<int32_t, 1> {
422  using type = int32x2_t;
423 };
424 
425 template <>
426 struct Raw128<float16_t, 2> {
427  using type = uint16x4_t;
428 };
429 
430 template <>
431 struct Raw128<bfloat16_t, 2> {
432  using type = uint16x4_t;
433 };
434 
435 template <>
436 struct Raw128<float, 1> {
437  using type = float32x2_t;
438 };
439 
440 // 16 (same as 64)
441 template <>
442 struct Raw128<uint8_t, 2> {
443  using type = uint8x8_t;
444 };
445 
446 template <>
447 struct Raw128<uint16_t, 1> {
448  using type = uint16x4_t;
449 };
450 
451 template <>
452 struct Raw128<int8_t, 2> {
453  using type = int8x8_t;
454 };
455 
456 template <>
457 struct Raw128<int16_t, 1> {
458  using type = int16x4_t;
459 };
460 
461 template <>
462 struct Raw128<float16_t, 1> {
463  using type = uint16x4_t;
464 };
465 
466 template <>
467 struct Raw128<bfloat16_t, 1> {
468  using type = uint16x4_t;
469 };
470 
471 // 8 (same as 64)
472 template <>
473 struct Raw128<uint8_t, 1> {
474  using type = uint8x8_t;
475 };
476 
477 template <>
478 struct Raw128<int8_t, 1> {
479  using type = int8x8_t;
480 };
481 
482 } // namespace detail
483 
484 template <typename T, size_t N = 16 / sizeof(T)>
485 class Vec128 {
486  using Raw = typename detail::Raw128<T, N>::type;
487 
488  public:
489  HWY_INLINE Vec128() {}
490  Vec128(const Vec128&) = default;
491  Vec128& operator=(const Vec128&) = default;
492  HWY_INLINE explicit Vec128(const Raw raw) : raw(raw) {}
493 
494  // Compound assignment. Only usable if there is a corresponding non-member
495  // binary operator overload. For example, only f32 and f64 support division.
496  HWY_INLINE Vec128& operator*=(const Vec128 other) {
497  return *this = (*this * other);
498  }
499  HWY_INLINE Vec128& operator/=(const Vec128 other) {
500  return *this = (*this / other);
501  }
502  HWY_INLINE Vec128& operator+=(const Vec128 other) {
503  return *this = (*this + other);
504  }
505  HWY_INLINE Vec128& operator-=(const Vec128 other) {
506  return *this = (*this - other);
507  }
508  HWY_INLINE Vec128& operator&=(const Vec128 other) {
509  return *this = (*this & other);
510  }
511  HWY_INLINE Vec128& operator|=(const Vec128 other) {
512  return *this = (*this | other);
513  }
514  HWY_INLINE Vec128& operator^=(const Vec128 other) {
515  return *this = (*this ^ other);
516  }
517 
518  Raw raw;
519 };
520 
521 template <typename T>
522 using Vec64 = Vec128<T, 8 / sizeof(T)>;
523 
524 template <typename T>
525 using Vec32 = Vec128<T, 4 / sizeof(T)>;
526 
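// For illustration: these aliases map lane type and count onto the underlying
// NEON registers, e.g.
//   Vec128<float>   // float32x4_t, 4 lanes (full 128-bit vector)
//   Vec64<float>    // Vec128<float, 2>, backed by float32x2_t
//   Vec32<float>    // Vec128<float, 1>, also backed by float32x2_t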
527 // FF..FF or 0.
528 template <typename T, size_t N = 16 / sizeof(T)>
529 class Mask128 {
530  // ARM C Language Extensions return and expect unsigned type.
531  using Raw = typename detail::Raw128<MakeUnsigned<T>, N>::type;
532 
533  public:
534  HWY_INLINE Mask128() {}
535  Mask128(const Mask128&) = default;
536  Mask128& operator=(const Mask128&) = default;
537  HWY_INLINE explicit Mask128(const Raw raw) : raw(raw) {}
538 
539  Raw raw;
540 };
541 
542 namespace detail {
543 
544 // Deduce Simd<T, N, 0> from Vec128<T, N>
545 struct DeduceD {
546  template <typename T, size_t N>
547  Simd<T, N, 0> operator()(Vec128<T, N>) const {
548  return Simd<T, N, 0>();
549  }
550 };
551 
552 } // namespace detail
553 
554 template <class V>
555 using DFromV = decltype(detail::DeduceD()(V()));
556 
557 template <class V>
558 using TFromV = TFromD<DFromV<V>>;
559 
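// Usage sketch (illustrative): these aliases recover the descriptor and lane
// type from a vector type, e.g.
//   using V = Vec128<int16_t, 4>;
//   using D = DFromV<V>;  // Simd<int16_t, 4, 0>
//   using T = TFromV<V>;  // int16_t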
560 // ------------------------------ BitCast
561 
562 namespace detail {
563 
564 // Converts from Vec128<T, N> to Vec128<uint8_t, N * sizeof(T)> using the
565 // vreinterpret*_u8_*() set of functions.
566 #define HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
567 #define HWY_NEON_BUILD_RET_HWY_CAST_TO_U8(type, size) \
568  Vec128<uint8_t, size * sizeof(type##_t)>
569 #define HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8(type, size) Vec128<type##_t, size> v
570 #define HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 v.raw
571 
572 // Special case of u8 to u8 since vreinterpret*_u8_u8 is obviously not defined.
573 template <size_t N>
574 HWY_INLINE Vec128<uint8_t, N> BitCastToByte(Vec128<uint8_t, N> v) {
575  return v;
576 }
577 
578 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(BitCastToByte, vreinterpret, _u8_,
579  HWY_CAST_TO_U8)
580 HWY_NEON_DEF_FUNCTION_INTS(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
581 HWY_NEON_DEF_FUNCTION_UINT_16(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
582 HWY_NEON_DEF_FUNCTION_UINT_32(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
583 HWY_NEON_DEF_FUNCTION_UINT_64(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
584 
585 // Special cases for [b]float16_t, which have the same Raw as uint16_t.
586 template <size_t N>
588  return BitCastToByte(Vec128<uint16_t, N>(v.raw));
589 }
590 template <size_t N>
592  return BitCastToByte(Vec128<uint16_t, N>(v.raw));
593 }
594 
595 #undef HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
596 #undef HWY_NEON_BUILD_RET_HWY_CAST_TO_U8
597 #undef HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8
598 #undef HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8
599 
600 template <size_t N>
603  return v;
604 }
605 
606 // 64-bit or less:
607 
608 template <size_t N, HWY_IF_LE64(int8_t, N)>
611  return Vec128<int8_t, N>(vreinterpret_s8_u8(v.raw));
612 }
613 template <size_t N, HWY_IF_LE64(uint16_t, N)>
616  return Vec128<uint16_t, N>(vreinterpret_u16_u8(v.raw));
617 }
618 template <size_t N, HWY_IF_LE64(int16_t, N)>
621  return Vec128<int16_t, N>(vreinterpret_s16_u8(v.raw));
622 }
623 template <size_t N, HWY_IF_LE64(uint32_t, N)>
626  return Vec128<uint32_t, N>(vreinterpret_u32_u8(v.raw));
627 }
628 template <size_t N, HWY_IF_LE64(int32_t, N)>
631  return Vec128<int32_t, N>(vreinterpret_s32_u8(v.raw));
632 }
633 template <size_t N, HWY_IF_LE64(float, N)>
636  return Vec128<float, N>(vreinterpret_f32_u8(v.raw));
637 }
640  return Vec64<uint64_t>(vreinterpret_u64_u8(v.raw));
641 }
644  return Vec64<int64_t>(vreinterpret_s64_u8(v.raw));
645 }
646 #if HWY_ARCH_ARM_A64
649  return Vec64<double>(vreinterpret_f64_u8(v.raw));
650 }
651 #endif
652 
653 // 128-bit full:
654 
656  Vec128<uint8_t> v) {
657  return Vec128<int8_t>(vreinterpretq_s8_u8(v.raw));
658 }
660  Vec128<uint8_t> v) {
661  return Vec128<uint16_t>(vreinterpretq_u16_u8(v.raw));
662 }
664  Vec128<uint8_t> v) {
665  return Vec128<int16_t>(vreinterpretq_s16_u8(v.raw));
666 }
668  Vec128<uint8_t> v) {
669  return Vec128<uint32_t>(vreinterpretq_u32_u8(v.raw));
670 }
672  Vec128<uint8_t> v) {
673  return Vec128<int32_t>(vreinterpretq_s32_u8(v.raw));
674 }
676  Vec128<uint8_t> v) {
677  return Vec128<float>(vreinterpretq_f32_u8(v.raw));
678 }
680  Vec128<uint8_t> v) {
681  return Vec128<uint64_t>(vreinterpretq_u64_u8(v.raw));
682 }
684  Vec128<uint8_t> v) {
685  return Vec128<int64_t>(vreinterpretq_s64_u8(v.raw));
686 }
687 
688 #if HWY_ARCH_ARM_A64
690  Vec128<uint8_t> v) {
691  return Vec128<double>(vreinterpretq_f64_u8(v.raw));
692 }
693 #endif
694 
695 // Special cases for [b]float16_t, which have the same Raw as uint16_t.
696 template <size_t N>
700 }
701 template <size_t N>
705 }
706 
707 } // namespace detail
708 
709 template <typename T, size_t N, typename FromT>
710 HWY_API Vec128<T, N> BitCast(Simd<T, N, 0> d,
711  Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
712  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
713 }
714 
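// Usage sketch (illustrative): BitCast reinterprets the bits of a vector as
// another lane type of the same overall size, via the byte casts above, e.g.
//   const Full128<float> df;
//   const Full128<uint32_t> du;
//   const Vec128<uint32_t> bits = BitCast(du, Set(df, 1.0f));
//   // every lane of "bits" is now 0x3F800000
// (Set is defined in the next section.)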
715 // ------------------------------ Set
716 
717 // Returns a vector with all lanes set to "t".
718 #define HWY_NEON_BUILD_TPL_HWY_SET1
719 #define HWY_NEON_BUILD_RET_HWY_SET1(type, size) Vec128<type##_t, size>
720 #define HWY_NEON_BUILD_PARAM_HWY_SET1(type, size) \
721  Simd<type##_t, size, 0> /* tag */, const type##_t t
722 #define HWY_NEON_BUILD_ARG_HWY_SET1 t
723 
724 HWY_NEON_DEF_FUNCTION_ALL_TYPES(Set, vdup, _n_, HWY_SET1)
725 
726 #undef HWY_NEON_BUILD_TPL_HWY_SET1
727 #undef HWY_NEON_BUILD_RET_HWY_SET1
728 #undef HWY_NEON_BUILD_PARAM_HWY_SET1
729 #undef HWY_NEON_BUILD_ARG_HWY_SET1
730 
731 // Returns an all-zero vector.
732 template <typename T, size_t N>
733 HWY_API Vec128<T, N> Zero(Simd<T, N, 0> d) {
734  return Set(d, 0);
735 }
736 
737 template <size_t N>
738 HWY_API Vec128<bfloat16_t, N> Zero(Simd<bfloat16_t, N, 0> /* tag */) {
739  return Vec128<bfloat16_t, N>(Zero(Simd<uint16_t, N, 0>()).raw);
740 }
741 
742 template <class D>
743 using VFromD = decltype(Zero(D()));
744 
745 // Returns a vector with uninitialized elements.
746 template <typename T, size_t N>
747 HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> /*d*/) {
748  HWY_DIAGNOSTICS(push)
749  HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
750  typename detail::Raw128<T, N>::type a;
751  return Vec128<T, N>(a);
752  HWY_DIAGNOSTICS(pop)
753 }
754 
755 // Returns a vector with lane i=[0, N) set to "first" + i.
756 template <typename T, size_t N, typename T2>
757 Vec128<T, N> Iota(const Simd<T, N, 0> d, const T2 first) {
758  HWY_ALIGN T lanes[16 / sizeof(T)];
759  for (size_t i = 0; i < 16 / sizeof(T); ++i) {
760  lanes[i] = static_cast<T>(first + static_cast<T2>(i));
761  }
762  return Load(d, lanes);
763 }
764 
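// Usage sketch (illustrative), for a full vector of four int32_t lanes:
//   const Full128<int32_t> d;
//   const auto k5   = Set(d, 5);    // {5, 5, 5, 5}
//   const auto zero = Zero(d);      // {0, 0, 0, 0}
//   const auto ramp = Iota(d, 10);  // {10, 11, 12, 13}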
765 // ------------------------------ GetLane
766 
768  return vgetq_lane_u8(v.raw, 0);
769 }
770 template <size_t N>
772  return vget_lane_u8(v.raw, 0);
773 }
774 
776  return vgetq_lane_s8(v.raw, 0);
777 }
778 template <size_t N>
780  return vget_lane_s8(v.raw, 0);
781 }
782 
784  return vgetq_lane_u16(v.raw, 0);
785 }
786 template <size_t N>
788  return vget_lane_u16(v.raw, 0);
789 }
790 
792  return vgetq_lane_s16(v.raw, 0);
793 }
794 template <size_t N>
796  return vget_lane_s16(v.raw, 0);
797 }
798 
800  return vgetq_lane_u32(v.raw, 0);
801 }
802 template <size_t N>
804  return vget_lane_u32(v.raw, 0);
805 }
806 
808  return vgetq_lane_s32(v.raw, 0);
809 }
810 template <size_t N>
812  return vget_lane_s32(v.raw, 0);
813 }
814 
816  return vgetq_lane_u64(v.raw, 0);
817 }
818 HWY_API uint64_t GetLane(const Vec64<uint64_t> v) {
819  return vget_lane_u64(v.raw, 0);
820 }
822  return vgetq_lane_s64(v.raw, 0);
823 }
824 HWY_API int64_t GetLane(const Vec64<int64_t> v) {
825  return vget_lane_s64(v.raw, 0);
826 }
827 
829  return vgetq_lane_f32(v.raw, 0);
830 }
831 HWY_API float GetLane(const Vec64<float> v) { return vget_lane_f32(v.raw, 0); }
832 HWY_API float GetLane(const Vec32<float> v) { return vget_lane_f32(v.raw, 0); }
833 #if HWY_ARCH_ARM_A64
834 HWY_API double GetLane(const Vec128<double, 2> v) {
835  return vgetq_lane_f64(v.raw, 0);
836 }
837 HWY_API double GetLane(const Vec64<double> v) {
838  return vget_lane_f64(v.raw, 0);
839 }
840 #endif
841 
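// Usage sketch (illustrative): GetLane extracts lane 0 as a scalar, e.g.
//   const Full128<float> d;
//   const float first = GetLane(Iota(d, 1.0f));  // 1.0f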
842 // ================================================== ARITHMETIC
843 
844 // ------------------------------ Addition
845 HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator+, vadd, _, 2)
846 
847 // ------------------------------ Subtraction
848 HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator-, vsub, _, 2)
849 
850 // ------------------------------ SumsOf8
851 
852 HWY_API Vec128<uint64_t> SumsOf8(const Vec128<uint8_t> v) {
853  return Vec128<uint64_t>(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v.raw))));
854 }
856  return Vec64<uint64_t>(vpaddl_u32(vpaddl_u16(vpaddl_u8(v.raw))));
857 }
858 
859 // ------------------------------ SaturatedAdd
860 // Only defined for uint8_t, uint16_t and their signed versions, as in other
861 // architectures.
862 
863 // Returns a + b clamped to the destination range.
868 
869 // ------------------------------ SaturatedSub
870 
871 // Returns a - b clamped to the destination range.
876 
877 // Not part of API, used in implementation.
878 namespace detail {
883 } // namespace detail
884 
885 // ------------------------------ Average
886 
887 // Returns (a + b + 1) / 2
890 
891 // ------------------------------ Neg
892 
894 HWY_NEON_DEF_FUNCTION_INT_8_16_32(Neg, vneg, _, 1) // i64 implemented below
895 
896 HWY_API Vec64<int64_t> Neg(const Vec64<int64_t> v) {
897 #if HWY_ARCH_ARM_A64
898  return Vec64<int64_t>(vneg_s64(v.raw));
899 #else
900  return Zero(Full64<int64_t>()) - v;
901 #endif
902 }
903 
905 #if HWY_ARCH_ARM_A64
906  return Vec128<int64_t>(vnegq_s64(v.raw));
907 #else
908  return Zero(Full128<int64_t>()) - v;
909 #endif
910 }
911 
912 // ------------------------------ ShiftLeft
913 
914 // Customize HWY_NEON_DEF_FUNCTION to special-case count=0 (not supported).
915 #pragma push_macro("HWY_NEON_DEF_FUNCTION")
916 #undef HWY_NEON_DEF_FUNCTION
917 #define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
918  template <int kBits> \
919  HWY_API Vec128<type##_t, size> name(const Vec128<type##_t, size> v) { \
920  return kBits == 0 ? v \
921  : Vec128<type##_t, size>(HWY_NEON_EVAL( \
922  prefix##infix##suffix, v.raw, HWY_MAX(1, kBits))); \
923  }
924 
925 HWY_NEON_DEF_FUNCTION_INTS_UINTS(ShiftLeft, vshl, _n_, HWY_SHIFT)
926 
927 HWY_NEON_DEF_FUNCTION_UINTS(ShiftRight, vshr, _n_, HWY_SHIFT)
928 HWY_NEON_DEF_FUNCTION_INTS(ShiftRight, vshr, _n_, HWY_SHIFT)
929 
930 #pragma pop_macro("HWY_NEON_DEF_FUNCTION")
931 
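// Usage sketch (illustrative): the shift count is a compile-time constant, and
// ShiftRight is arithmetic for signed lanes, logical for unsigned lanes:
//   const Full128<int32_t> d;
//   const auto v = Set(d, -8);
//   const auto a = ShiftLeft<2>(v);   // -32 in every lane
//   const auto b = ShiftRight<2>(v);  // -2 in every lane (sign-extending)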
932 // ------------------------------ RotateRight (ShiftRight, Or)
933 
934 template <int kBits, size_t N>
936  static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
937  if (kBits == 0) return v;
938  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(31, 32 - kBits)>(v));
939 }
940 
941 template <int kBits, size_t N>
943  static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
944  if (kBits == 0) return v;
945  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(63, 64 - kBits)>(v));
946 }
947 
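// Worked example (illustrative): with d a Full128<uint32_t> descriptor,
//   RotateRight<8>(Set(d, 0x12345678u))
// yields 0x78123456 in every lane: the low 8 bits wrap to the top via
// Or(ShiftRight<8>(v), ShiftLeft<24>(v)).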
948 // NOTE: vxarq_u64 can be applied to uint64_t, but we do not yet have a
949 // mechanism for checking for extensions to ARMv8.
950 
951 // ------------------------------ Shl
952 
954  const Vec128<uint8_t> bits) {
955  return Vec128<uint8_t>(vshlq_u8(v.raw, vreinterpretq_s8_u8(bits.raw)));
956 }
957 template <size_t N, HWY_IF_LE64(uint8_t, N)>
959  const Vec128<uint8_t, N> bits) {
960  return Vec128<uint8_t, N>(vshl_u8(v.raw, vreinterpret_s8_u8(bits.raw)));
961 }
962 
964  const Vec128<uint16_t> bits) {
965  return Vec128<uint16_t>(vshlq_u16(v.raw, vreinterpretq_s16_u16(bits.raw)));
966 }
967 template <size_t N, HWY_IF_LE64(uint16_t, N)>
969  const Vec128<uint16_t, N> bits) {
970  return Vec128<uint16_t, N>(vshl_u16(v.raw, vreinterpret_s16_u16(bits.raw)));
971 }
972 
974  const Vec128<uint32_t> bits) {
975  return Vec128<uint32_t>(vshlq_u32(v.raw, vreinterpretq_s32_u32(bits.raw)));
976 }
977 template <size_t N, HWY_IF_LE64(uint32_t, N)>
979  const Vec128<uint32_t, N> bits) {
980  return Vec128<uint32_t, N>(vshl_u32(v.raw, vreinterpret_s32_u32(bits.raw)));
981 }
982 
984  const Vec128<uint64_t> bits) {
985  return Vec128<uint64_t>(vshlq_u64(v.raw, vreinterpretq_s64_u64(bits.raw)));
986 }
988  const Vec64<uint64_t> bits) {
989  return Vec64<uint64_t>(vshl_u64(v.raw, vreinterpret_s64_u64(bits.raw)));
990 }
991 
993  const Vec128<int8_t> bits) {
994  return Vec128<int8_t>(vshlq_s8(v.raw, bits.raw));
995 }
996 template <size_t N, HWY_IF_LE64(int8_t, N)>
998  const Vec128<int8_t, N> bits) {
999  return Vec128<int8_t, N>(vshl_s8(v.raw, bits.raw));
1000 }
1001 
1003  const Vec128<int16_t> bits) {
1004  return Vec128<int16_t>(vshlq_s16(v.raw, bits.raw));
1005 }
1006 template <size_t N, HWY_IF_LE64(int16_t, N)>
1008  const Vec128<int16_t, N> bits) {
1009  return Vec128<int16_t, N>(vshl_s16(v.raw, bits.raw));
1010 }
1011 
1013  const Vec128<int32_t> bits) {
1014  return Vec128<int32_t>(vshlq_s32(v.raw, bits.raw));
1015 }
1016 template <size_t N, HWY_IF_LE64(int32_t, N)>
1018  const Vec128<int32_t, N> bits) {
1019  return Vec128<int32_t, N>(vshl_s32(v.raw, bits.raw));
1020 }
1021 
1023  const Vec128<int64_t> bits) {
1024  return Vec128<int64_t>(vshlq_s64(v.raw, bits.raw));
1025 }
1027  const Vec64<int64_t> bits) {
1028  return Vec64<int64_t>(vshl_s64(v.raw, bits.raw));
1029 }
1030 
1031 // ------------------------------ Shr (Neg)
1032 
1034  const Vec128<uint8_t> bits) {
1035  const int8x16_t neg_bits = Neg(BitCast(Full128<int8_t>(), bits)).raw;
1036  return Vec128<uint8_t>(vshlq_u8(v.raw, neg_bits));
1037 }
1038 template <size_t N, HWY_IF_LE64(uint8_t, N)>
1040  const Vec128<uint8_t, N> bits) {
1041  const int8x8_t neg_bits = Neg(BitCast(Simd<int8_t, N, 0>(), bits)).raw;
1042  return Vec128<uint8_t, N>(vshl_u8(v.raw, neg_bits));
1043 }
1044 
1046  const Vec128<uint16_t> bits) {
1047  const int16x8_t neg_bits = Neg(BitCast(Full128<int16_t>(), bits)).raw;
1048  return Vec128<uint16_t>(vshlq_u16(v.raw, neg_bits));
1049 }
1050 template <size_t N, HWY_IF_LE64(uint16_t, N)>
1052  const Vec128<uint16_t, N> bits) {
1053  const int16x4_t neg_bits = Neg(BitCast(Simd<int16_t, N, 0>(), bits)).raw;
1054  return Vec128<uint16_t, N>(vshl_u16(v.raw, neg_bits));
1055 }
1056 
1058  const Vec128<uint32_t> bits) {
1059  const int32x4_t neg_bits = Neg(BitCast(Full128<int32_t>(), bits)).raw;
1060  return Vec128<uint32_t>(vshlq_u32(v.raw, neg_bits));
1061 }
1062 template <size_t N, HWY_IF_LE64(uint32_t, N)>
1064  const Vec128<uint32_t, N> bits) {
1065  const int32x2_t neg_bits = Neg(BitCast(Simd<int32_t, N, 0>(), bits)).raw;
1066  return Vec128<uint32_t, N>(vshl_u32(v.raw, neg_bits));
1067 }
1068 
1070  const Vec128<uint64_t> bits) {
1071  const int64x2_t neg_bits = Neg(BitCast(Full128<int64_t>(), bits)).raw;
1072  return Vec128<uint64_t>(vshlq_u64(v.raw, neg_bits));
1073 }
1075  const Vec64<uint64_t> bits) {
1076  const int64x1_t neg_bits = Neg(BitCast(Full64<int64_t>(), bits)).raw;
1077  return Vec64<uint64_t>(vshl_u64(v.raw, neg_bits));
1078 }
1079 
1081  const Vec128<int8_t> bits) {
1082  return Vec128<int8_t>(vshlq_s8(v.raw, Neg(bits).raw));
1083 }
1084 template <size_t N, HWY_IF_LE64(int8_t, N)>
1086  const Vec128<int8_t, N> bits) {
1087  return Vec128<int8_t, N>(vshl_s8(v.raw, Neg(bits).raw));
1088 }
1089 
1091  const Vec128<int16_t> bits) {
1092  return Vec128<int16_t>(vshlq_s16(v.raw, Neg(bits).raw));
1093 }
1094 template <size_t N, HWY_IF_LE64(int16_t, N)>
1096  const Vec128<int16_t, N> bits) {
1097  return Vec128<int16_t, N>(vshl_s16(v.raw, Neg(bits).raw));
1098 }
1099 
1101  const Vec128<int32_t> bits) {
1102  return Vec128<int32_t>(vshlq_s32(v.raw, Neg(bits).raw));
1103 }
1104 template <size_t N, HWY_IF_LE64(int32_t, N)>
1106  const Vec128<int32_t, N> bits) {
1107  return Vec128<int32_t, N>(vshl_s32(v.raw, Neg(bits).raw));
1108 }
1109 
1111  const Vec128<int64_t> bits) {
1112  return Vec128<int64_t>(vshlq_s64(v.raw, Neg(bits).raw));
1113 }
1115  const Vec64<int64_t> bits) {
1116  return Vec64<int64_t>(vshl_s64(v.raw, Neg(bits).raw));
1117 }
1118 
1119 // ------------------------------ ShiftLeftSame (Shl)
1120 
1121 template <typename T, size_t N>
1123  return v << Set(Simd<T, N, 0>(), static_cast<T>(bits));
1124 }
1125 template <typename T, size_t N>
1127  return v >> Set(Simd<T, N, 0>(), static_cast<T>(bits));
1128 }
1129 
1130 // ------------------------------ Integer multiplication
1131 
1132 // Unsigned
1134  const Vec128<uint16_t> b) {
1135  return Vec128<uint16_t>(vmulq_u16(a.raw, b.raw));
1136 }
1138  const Vec128<uint32_t> b) {
1139  return Vec128<uint32_t>(vmulq_u32(a.raw, b.raw));
1140 }
1141 
1142 template <size_t N, HWY_IF_LE64(uint16_t, N)>
1144  const Vec128<uint16_t, N> b) {
1145  return Vec128<uint16_t, N>(vmul_u16(a.raw, b.raw));
1146 }
1147 template <size_t N, HWY_IF_LE64(uint32_t, N)>
1149  const Vec128<uint32_t, N> b) {
1150  return Vec128<uint32_t, N>(vmul_u32(a.raw, b.raw));
1151 }
1152 
1153 // Signed
1155  const Vec128<int16_t> b) {
1156  return Vec128<int16_t>(vmulq_s16(a.raw, b.raw));
1157 }
1159  const Vec128<int32_t> b) {
1160  return Vec128<int32_t>(vmulq_s32(a.raw, b.raw));
1161 }
1162 
1163 template <size_t N, HWY_IF_LE64(uint16_t, N)>
1165  const Vec128<int16_t, N> b) {
1166  return Vec128<int16_t, N>(vmul_s16(a.raw, b.raw));
1167 }
1168 template <size_t N, HWY_IF_LE64(int32_t, N)>
1170  const Vec128<int32_t, N> b) {
1171  return Vec128<int32_t, N>(vmul_s32(a.raw, b.raw));
1172 }
1173 
1174 // Returns the upper 16 bits of a * b in each lane.
1176  const Vec128<int16_t> b) {
1177  int32x4_t rlo = vmull_s16(vget_low_s16(a.raw), vget_low_s16(b.raw));
1178 #if HWY_ARCH_ARM_A64
1179  int32x4_t rhi = vmull_high_s16(a.raw, b.raw);
1180 #else
1181  int32x4_t rhi = vmull_s16(vget_high_s16(a.raw), vget_high_s16(b.raw));
1182 #endif
1183  return Vec128<int16_t>(
1184  vuzp2q_s16(vreinterpretq_s16_s32(rlo), vreinterpretq_s16_s32(rhi)));
1185 }
1187  const Vec128<uint16_t> b) {
1188  uint32x4_t rlo = vmull_u16(vget_low_u16(a.raw), vget_low_u16(b.raw));
1189 #if HWY_ARCH_ARM_A64
1190  uint32x4_t rhi = vmull_high_u16(a.raw, b.raw);
1191 #else
1192  uint32x4_t rhi = vmull_u16(vget_high_u16(a.raw), vget_high_u16(b.raw));
1193 #endif
1194  return Vec128<uint16_t>(
1195  vuzp2q_u16(vreinterpretq_u16_u32(rlo), vreinterpretq_u16_u32(rhi)));
1196 }
1197 
1198 template <size_t N, HWY_IF_LE64(int16_t, N)>
1200  const Vec128<int16_t, N> b) {
1201  int16x8_t hi_lo = vreinterpretq_s16_s32(vmull_s16(a.raw, b.raw));
1202  return Vec128<int16_t, N>(vget_low_s16(vuzp2q_s16(hi_lo, hi_lo)));
1203 }
1204 template <size_t N, HWY_IF_LE64(uint16_t, N)>
1206  const Vec128<uint16_t, N> b) {
1207  uint16x8_t hi_lo = vreinterpretq_u16_u32(vmull_u16(a.raw, b.raw));
1208  return Vec128<uint16_t, N>(vget_low_u16(vuzp2q_u16(hi_lo, hi_lo)));
1209 }
1210 
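// Worked example (illustrative): MulHigh keeps the upper 16 bits of the 32-bit
// product. With d a Full128<uint16_t> descriptor, 300 * 400 = 120000 = 0x1D4C0,
// so
//   GetLane(MulHigh(Set(d, 300), Set(d, 400)))  // 0x0001
// whereas the wrapping operator* would return the low half, 0xD4C0.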
1212  return Vec128<int16_t>(vqrdmulhq_s16(a.raw, b.raw));
1213 }
1214 template <size_t N, HWY_IF_LE64(int16_t, N)>
1216  Vec128<int16_t, N> b) {
1217  return Vec128<int16_t, N>(vqrdmulh_s16(a.raw, b.raw));
1218 }
1219 
1220 // ------------------------------ Floating-point mul / div
1221 
1222 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator*, vmul, _, 2)
1223 
1224 // Approximate reciprocal
1226  return Vec128<float>(vrecpeq_f32(v.raw));
1227 }
1228 template <size_t N>
1230  return Vec128<float, N>(vrecpe_f32(v.raw));
1231 }
1232 
1233 #if HWY_ARCH_ARM_A64
1234 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator/, vdiv, _, 2)
1235 #else
1236 // Not defined on armv7: approximate
1237 namespace detail {
1238 
1239 HWY_INLINE Vec128<float> ReciprocalNewtonRaphsonStep(
1240  const Vec128<float> recip, const Vec128<float> divisor) {
1241  return Vec128<float>(vrecpsq_f32(recip.raw, divisor.raw));
1242 }
1243 template <size_t N>
1244 HWY_INLINE Vec128<float, N> ReciprocalNewtonRaphsonStep(
1245  const Vec128<float, N> recip, Vec128<float, N> divisor) {
1246  return Vec128<float, N>(vrecps_f32(recip.raw, divisor.raw));
1247 }
1248 
1249 } // namespace detail
1250 
1251 template <size_t N>
1252 HWY_API Vec128<float, N> operator/(const Vec128<float, N> a,
1253  const Vec128<float, N> b) {
1254  auto x = ApproximateReciprocal(b);
1255  x *= detail::ReciprocalNewtonRaphsonStep(x, b);
1256  x *= detail::ReciprocalNewtonRaphsonStep(x, b);
1257  x *= detail::ReciprocalNewtonRaphsonStep(x, b);
1258  return a * x;
1259 }
1260 #endif
1261 
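// Note on the ARMv7 fallback above (illustrative): vrecps computes 2 - x * b,
// so each
//   x *= detail::ReciprocalNewtonRaphsonStep(x, b);
// performs the Newton-Raphson update x <- x * (2 - x * b), roughly doubling
// the number of correct bits of the 1/b estimate; three steps refine the
// ~8-bit vrecpe estimate to nearly full float precision.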
1262 // ------------------------------ Absolute value of difference.
1263 
1265  return Vec128<float>(vabdq_f32(a.raw, b.raw));
1266 }
1267 template <size_t N, HWY_IF_LE64(float, N)>
1269  const Vec128<float, N> b) {
1270  return Vec128<float, N>(vabd_f32(a.raw, b.raw));
1271 }
1272 
1273 // ------------------------------ Floating-point multiply-add variants
1274 
1275 // Returns add + mul * x
1276 #if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
1277 template <size_t N, HWY_IF_LE64(float, N)>
1278 HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul,
1279  const Vec128<float, N> x,
1280  const Vec128<float, N> add) {
1281  return Vec128<float, N>(vfma_f32(add.raw, mul.raw, x.raw));
1282 }
1283 HWY_API Vec128<float> MulAdd(const Vec128<float> mul, const Vec128<float> x,
1284  const Vec128<float> add) {
1285  return Vec128<float>(vfmaq_f32(add.raw, mul.raw, x.raw));
1286 }
1287 #else
1288 // Emulate FMA for floats.
1289 template <size_t N>
1291  const Vec128<float, N> x,
1292  const Vec128<float, N> add) {
1293  return mul * x + add;
1294 }
1295 #endif
1296 
1297 #if HWY_ARCH_ARM_A64
1298 HWY_API Vec64<double> MulAdd(const Vec64<double> mul, const Vec64<double> x,
1299  const Vec64<double> add) {
1300  return Vec64<double>(vfma_f64(add.raw, mul.raw, x.raw));
1301 }
1302 HWY_API Vec128<double> MulAdd(const Vec128<double> mul, const Vec128<double> x,
1303  const Vec128<double> add) {
1304  return Vec128<double>(vfmaq_f64(add.raw, mul.raw, x.raw));
1305 }
1306 #endif
1307 
1308 // Returns add - mul * x
1309 #if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
1310 template <size_t N, HWY_IF_LE64(float, N)>
1311 HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
1312  const Vec128<float, N> x,
1313  const Vec128<float, N> add) {
1314  return Vec128<float, N>(vfms_f32(add.raw, mul.raw, x.raw));
1315 }
1316 HWY_API Vec128<float> NegMulAdd(const Vec128<float> mul, const Vec128<float> x,
1317  const Vec128<float> add) {
1318  return Vec128<float>(vfmsq_f32(add.raw, mul.raw, x.raw));
1319 }
1320 #else
1321 // Emulate FMA for floats.
1322 template <size_t N>
1324  const Vec128<float, N> x,
1325  const Vec128<float, N> add) {
1326  return add - mul * x;
1327 }
1328 #endif
1329 
1330 #if HWY_ARCH_ARM_A64
1331 HWY_API Vec64<double> NegMulAdd(const Vec64<double> mul, const Vec64<double> x,
1332  const Vec64<double> add) {
1333  return Vec64<double>(vfms_f64(add.raw, mul.raw, x.raw));
1334 }
1335 HWY_API Vec128<double> NegMulAdd(const Vec128<double> mul,
1336  const Vec128<double> x,
1337  const Vec128<double> add) {
1338  return Vec128<double>(vfmsq_f64(add.raw, mul.raw, x.raw));
1339 }
1340 #endif
1341 
1342 // Returns mul * x - sub
1343 template <size_t N>
1345  const Vec128<float, N> x,
1346  const Vec128<float, N> sub) {
1347  return MulAdd(mul, x, Neg(sub));
1348 }
1349 
1350 // Returns -mul * x - sub
1351 template <size_t N>
1353  const Vec128<float, N> x,
1354  const Vec128<float, N> sub) {
1355  return Neg(MulAdd(mul, x, sub));
1356 }
1357 
1358 #if HWY_ARCH_ARM_A64
1359 template <size_t N>
1360 HWY_API Vec128<double, N> MulSub(const Vec128<double, N> mul,
1361  const Vec128<double, N> x,
1362  const Vec128<double, N> sub) {
1363  return MulAdd(mul, x, Neg(sub));
1364 }
1365 template <size_t N>
1366 HWY_API Vec128<double, N> NegMulSub(const Vec128<double, N> mul,
1367  const Vec128<double, N> x,
1368  const Vec128<double, N> sub) {
1369  return Neg(MulAdd(mul, x, sub));
1370 }
1371 #endif
1372 
1373 // ------------------------------ Floating-point square root (IfThenZeroElse)
1374 
1375 // Approximate reciprocal square root
1377  return Vec128<float>(vrsqrteq_f32(v.raw));
1378 }
1379 template <size_t N>
1381  return Vec128<float, N>(vrsqrte_f32(v.raw));
1382 }
1383 
1384 // Full precision square root
1385 #if HWY_ARCH_ARM_A64
1386 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Sqrt, vsqrt, _, 1)
1387 #else
1388 namespace detail {
1389 
1390 HWY_INLINE Vec128<float> ReciprocalSqrtStep(const Vec128<float> root,
1391  const Vec128<float> recip) {
1392  return Vec128<float>(vrsqrtsq_f32(root.raw, recip.raw));
1393 }
1394 template <size_t N>
1395 HWY_INLINE Vec128<float, N> ReciprocalSqrtStep(Vec128<float, N> root,
1396  Vec128<float, N> recip) {
1397  return Vec128<float, N>(vrsqrts_f32(root.raw, recip.raw));
1398 }
1399 
1400 } // namespace detail
1401 
1402 // Not defined on armv7: approximate
1403 template <size_t N>
1404 HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) {
1405  auto recip = ApproximateReciprocalSqrt(v);
1406 
1407  recip *= detail::ReciprocalSqrtStep(v * recip, recip);
1408  recip *= detail::ReciprocalSqrtStep(v * recip, recip);
1409  recip *= detail::ReciprocalSqrtStep(v * recip, recip);
1410 
1411  const auto root = v * recip;
1412  return IfThenZeroElse(v == Zero(Simd<float, N, 0>()), root);
1413 }
1414 #endif
1415 
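// Note on the ARMv7 fallback above (illustrative): vrsqrts computes
// (3 - a * b) / 2, so each
//   recip *= detail::ReciprocalSqrtStep(v * recip, recip);
// is a Newton-Raphson step driving recip toward 1/sqrt(v); the final
// IfThenZeroElse handles v == 0, where v * recip would otherwise be
// 0 * inf = NaN.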
1416 // ================================================== LOGICAL
1417 
1418 // ------------------------------ Not
1419 
1420 // There is no 64-bit vmvn, so cast instead of using HWY_NEON_DEF_FUNCTION.
1421 template <typename T>
1423  const Full128<T> d;
1424  const Repartition<uint8_t, decltype(d)> d8;
1425  return BitCast(d, Vec128<uint8_t>(vmvnq_u8(BitCast(d8, v).raw)));
1426 }
1427 template <typename T, size_t N, HWY_IF_LE64(T, N)>
1429  const Simd<T, N, 0> d;
1430  const Repartition<uint8_t, decltype(d)> d8;
1431  using V8 = decltype(Zero(d8));
1432  return BitCast(d, V8(vmvn_u8(BitCast(d8, v).raw)));
1433 }
1434 
1435 // ------------------------------ And
1437 
1438 // Uses the u32/64 defined above.
1439 template <typename T, size_t N, HWY_IF_FLOAT(T)>
1440 HWY_API Vec128<T, N> And(const Vec128<T, N> a, const Vec128<T, N> b) {
1441  const DFromV<decltype(a)> d;
1442  const RebindToUnsigned<decltype(d)> du;
1443  return BitCast(d, BitCast(du, a) & BitCast(du, b));
1444 }
1445 
1446 // ------------------------------ AndNot
1447 
1448 namespace detail {
1449 // reversed_andnot returns a & ~b.
1450 HWY_NEON_DEF_FUNCTION_INTS_UINTS(reversed_andnot, vbic, _, 2)
1451 } // namespace detail
1452 
1453 // Returns ~not_mask & mask.
1454 template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
1456  const Vec128<T, N> mask) {
1457  return detail::reversed_andnot(mask, not_mask);
1458 }
1459 
1460 // Uses the u32/64 defined above.
1461 template <typename T, size_t N, HWY_IF_FLOAT(T)>
1462 HWY_API Vec128<T, N> AndNot(const Vec128<T, N> not_mask,
1463  const Vec128<T, N> mask) {
1464  const DFromV<decltype(mask)> d;
1465  const RebindToUnsigned<decltype(d)> du;
1466  VFromD<decltype(du)> ret =
1467  detail::reversed_andnot(BitCast(du, mask), BitCast(du, not_mask));
1468  return BitCast(d, ret);
1469 }
1470 
1471 // ------------------------------ Or
1472 
1474 
1475 // Uses the u32/64 defined above.
1476 template <typename T, size_t N, HWY_IF_FLOAT(T)>
1477 HWY_API Vec128<T, N> Or(const Vec128<T, N> a, const Vec128<T, N> b) {
1478  const DFromV<decltype(a)> d;
1479  const RebindToUnsigned<decltype(d)> du;
1480  return BitCast(d, BitCast(du, a) | BitCast(du, b));
1481 }
1482 
1483 // ------------------------------ Xor
1484 
1486 
1487 // Uses the u32/64 defined above.
1488 template <typename T, size_t N, HWY_IF_FLOAT(T)>
1489 HWY_API Vec128<T, N> Xor(const Vec128<T, N> a, const Vec128<T, N> b) {
1490  const DFromV<decltype(a)> d;
1491  const RebindToUnsigned<decltype(d)> du;
1492  return BitCast(d, BitCast(du, a) ^ BitCast(du, b));
1493 }
1494 
1495 // ------------------------------ OrAnd
1496 
1497 template <typename T, size_t N>
1499  return Or(o, And(a1, a2));
1500 }
1501 
1502 // ------------------------------ IfVecThenElse
1503 
1504 template <typename T, size_t N>
1506  Vec128<T, N> no) {
1507  return IfThenElse(MaskFromVec(mask), yes, no);
1508 }
1509 
1510 // ------------------------------ Operator overloads (internal-only if float)
1511 
1512 template <typename T, size_t N>
1514  return And(a, b);
1515 }
1516 
1517 template <typename T, size_t N>
1519  return Or(a, b);
1520 }
1521 
1522 template <typename T, size_t N>
1524  return Xor(a, b);
1525 }
1526 
1527 // ------------------------------ PopulationCount
1528 
1529 #ifdef HWY_NATIVE_POPCNT
1530 #undef HWY_NATIVE_POPCNT
1531 #else
1532 #define HWY_NATIVE_POPCNT
1533 #endif
1534 
1535 namespace detail {
1536 
1537 template <typename T>
1539  const Full128<uint8_t> d8;
1540  return Vec128<T>(vcntq_u8(BitCast(d8, v).raw));
1541 }
1542 template <typename T, size_t N, HWY_IF_LE64(T, N)>
1544  Vec128<T, N> v) {
1545  const Simd<uint8_t, N, 0> d8;
1546  return Vec128<T, N>(vcnt_u8(BitCast(d8, v).raw));
1547 }
1548 
1549 // ARM lacks popcount for lane sizes > 1, so take pairwise sums of the bytes.
1550 template <typename T>
1552  const Full128<uint8_t> d8;
1553  const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);
1554  return Vec128<T>(vpaddlq_u8(bytes));
1555 }
1556 template <typename T, size_t N, HWY_IF_LE64(T, N)>
1558  Vec128<T, N> v) {
1560  const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);
1561  return Vec128<T, N>(vpaddl_u8(bytes));
1562 }
1563 
1564 template <typename T>
1566  const Full128<uint8_t> d8;
1567  const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);
1568  return Vec128<T>(vpaddlq_u16(vpaddlq_u8(bytes)));
1569 }
1570 template <typename T, size_t N, HWY_IF_LE64(T, N)>
1572  Vec128<T, N> v) {
1574  const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);
1575  return Vec128<T, N>(vpaddl_u16(vpaddl_u8(bytes)));
1576 }
1577 
1578 template <typename T>
1580  const Full128<uint8_t> d8;
1581  const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);
1582  return Vec128<T>(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(bytes))));
1583 }
1584 template <typename T, size_t N, HWY_IF_LE64(T, N)>
1586  Vec128<T, N> v) {
1588  const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);
1589  return Vec128<T, N>(vpaddl_u32(vpaddl_u16(vpaddl_u8(bytes))));
1590 }
1591 
1592 } // namespace detail
1593 
1594 template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
1596  return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
1597 }
1598 
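// Usage sketch (illustrative): PopulationCount operates per lane, e.g.
//   const Full128<uint32_t> d;
//   GetLane(PopulationCount(Set(d, 0xF0F0u)))  // 8 set bits
// For lane sizes above one byte, the byte-wise vcnt results are combined by
// the pairwise-add (vpaddl) chains above.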
1599 // ================================================== SIGN
1600 
1601 // ------------------------------ Abs
1602 
1603 // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
1605  return Vec128<int8_t>(vabsq_s8(v.raw));
1606 }
1608  return Vec128<int16_t>(vabsq_s16(v.raw));
1609 }
1611  return Vec128<int32_t>(vabsq_s32(v.raw));
1612 }
1613 // i64 is implemented after BroadcastSignBit.
1615  return Vec128<float>(vabsq_f32(v.raw));
1616 }
1617 
1618 template <size_t N, HWY_IF_LE64(int8_t, N)>
1620  return Vec128<int8_t, N>(vabs_s8(v.raw));
1621 }
1622 template <size_t N, HWY_IF_LE64(int16_t, N)>
1624  return Vec128<int16_t, N>(vabs_s16(v.raw));
1625 }
1626 template <size_t N, HWY_IF_LE64(int32_t, N)>
1628  return Vec128<int32_t, N>(vabs_s32(v.raw));
1629 }
1630 template <size_t N, HWY_IF_LE64(float, N)>
1632  return Vec128<float, N>(vabs_f32(v.raw));
1633 }
1634 
1635 #if HWY_ARCH_ARM_A64
1636 HWY_API Vec128<double> Abs(const Vec128<double> v) {
1637  return Vec128<double>(vabsq_f64(v.raw));
1638 }
1639 
1640 HWY_API Vec64<double> Abs(const Vec64<double> v) {
1641  return Vec64<double>(vabs_f64(v.raw));
1642 }
1643 #endif
1644 
1645 // ------------------------------ CopySign
1646 
1647 template <typename T, size_t N>
1649  const Vec128<T, N> sign) {
1650  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
1651  const auto msb = SignBit(Simd<T, N, 0>());
1652  return Or(AndNot(msb, magn), And(msb, sign));
1653 }
1654 
1655 template <typename T, size_t N>
1657  const Vec128<T, N> sign) {
1658  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
1659  return Or(abs, And(SignBit(Simd<T, N, 0>()), sign));
1660 }
1661 
1662 // ------------------------------ BroadcastSignBit
1663 
1664 template <typename T, size_t N, HWY_IF_SIGNED(T)>
1666  return ShiftRight<sizeof(T) * 8 - 1>(v);
1667 }
1668 
1669 // ================================================== MASK
1670 
1671 // ------------------------------ To/from vector
1672 
1673 // Mask and Vec have the same representation (true = FF..FF).
1674 template <typename T, size_t N>
1676  const Simd<MakeUnsigned<T>, N, 0> du;
1677  return Mask128<T, N>(BitCast(du, v).raw);
1678 }
1679 
1680 template <typename T, size_t N>
1682  return BitCast(d, Vec128<MakeUnsigned<T>, N>(v.raw));
1683 }
1684 
1685 // ------------------------------ RebindMask
1686 
1687 template <typename TFrom, typename TTo, size_t N>
1689  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
1690  return MaskFromVec(BitCast(dto, VecFromMask(Simd<TFrom, N, 0>(), m)));
1691 }
1692 
1693 // ------------------------------ IfThenElse(mask, yes, no) = mask ? b : a.
1694 
1695 #define HWY_NEON_BUILD_TPL_HWY_IF
1696 #define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128<type##_t, size>
1697 #define HWY_NEON_BUILD_PARAM_HWY_IF(type, size) \
1698  const Mask128<type##_t, size> mask, const Vec128<type##_t, size> yes, \
1699  const Vec128<type##_t, size> no
1700 #define HWY_NEON_BUILD_ARG_HWY_IF mask.raw, yes.raw, no.raw
1701 
1703 
1704 #undef HWY_NEON_BUILD_TPL_HWY_IF
1705 #undef HWY_NEON_BUILD_RET_HWY_IF
1706 #undef HWY_NEON_BUILD_PARAM_HWY_IF
1707 #undef HWY_NEON_BUILD_ARG_HWY_IF
1708 
1709 // mask ? yes : 0
1710 template <typename T, size_t N>
1712  const Vec128<T, N> yes) {
1713  return yes & VecFromMask(Simd<T, N, 0>(), mask);
1714 }
1715 
1716 // mask ? 0 : no
1717 template <typename T, size_t N>
1719  const Vec128<T, N> no) {
1720  return AndNot(VecFromMask(Simd<T, N, 0>(), mask), no);
1721 }
1722 
1723 template <typename T, size_t N>
1724 HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
1725  Vec128<T, N> no) {
1726  static_assert(IsSigned<T>(), "Only works for signed/float");
1727  const Simd<T, N, 0> d;
1728  const RebindToSigned<decltype(d)> di;
1729 
1730  Mask128<T, N> m = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
1731  return IfThenElse(m, yes, no);
1732 }
1733 
1734 template <typename T, size_t N>
1736  const auto zero = Zero(Simd<T, N, 0>());
1737  return Max(zero, v);
1738 }
1739 
1740 // ------------------------------ Mask logical
1741 
1742 template <typename T, size_t N>
1744  return MaskFromVec(Not(VecFromMask(Simd<T, N, 0>(), m)));
1745 }
1746 
1747 template <typename T, size_t N>
1749  const Simd<T, N, 0> d;
1750  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
1751 }
1752 
1753 template <typename T, size_t N>
1755  const Simd<T, N, 0> d;
1756  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
1757 }
1758 
1759 template <typename T, size_t N>
1761  const Simd<T, N, 0> d;
1762  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
1763 }
1764 
1765 template <typename T, size_t N>
1767  const Simd<T, N, 0> d;
1768  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
1769 }
1770 
1771 // ================================================== COMPARE
1772 
1773 // Comparisons fill a lane with 1-bits if the condition is true, else 0.
1774 
1775 // ------------------------------ Shuffle2301 (for i64 compares)
1776 
1777 // Swap 32-bit halves in 64-bits
1779  return Vec64<uint32_t>(vrev64_u32(v.raw));
1780 }
1782  return Vec64<int32_t>(vrev64_s32(v.raw));
1783 }
1785  return Vec64<float>(vrev64_f32(v.raw));
1786 }
1788  return Vec128<uint32_t>(vrev64q_u32(v.raw));
1789 }
1791  return Vec128<int32_t>(vrev64q_s32(v.raw));
1792 }
1794  return Vec128<float>(vrev64q_f32(v.raw));
1795 }
1796 
1797 #define HWY_NEON_BUILD_TPL_HWY_COMPARE
1798 #define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128<type##_t, size>
1799 #define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \
1800  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
1801 #define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw
1802 
1803 // ------------------------------ Equality
1804 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator==, vceq, _, HWY_COMPARE)
1805 #if HWY_ARCH_ARM_A64
1806 HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator==, vceq, _, HWY_COMPARE)
1807 #else
1808 // No 64-bit comparisons on armv7: emulate them below, after Shuffle2301.
1809 HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator==, vceq, _, HWY_COMPARE)
1810 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator==, vceq, _, HWY_COMPARE)
1811 #endif
1812 
1813 // ------------------------------ Inequality
1814 template <typename T, size_t N>
1816  return Not(a == b);
1817 }
1818 
1819 // ------------------------------ Strict inequality (signed, float)
1820 #if HWY_ARCH_ARM_A64
1821 HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator<, vclt, _, HWY_COMPARE)
1822 #else
1823 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator<, vclt, _, HWY_COMPARE)
1824 HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<, vclt, _, HWY_COMPARE)
1825 #endif
1826 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<, vclt, _, HWY_COMPARE)
1827 
1828 // ------------------------------ Weak inequality (float)
1829 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<=, vcle, _, HWY_COMPARE)
1830 
1831 #undef HWY_NEON_BUILD_TPL_HWY_COMPARE
1832 #undef HWY_NEON_BUILD_RET_HWY_COMPARE
1833 #undef HWY_NEON_BUILD_PARAM_HWY_COMPARE
1834 #undef HWY_NEON_BUILD_ARG_HWY_COMPARE
1835 
1836 // ------------------------------ ARMv7 i64 compare (Shuffle2301, Eq)
1837 
1838 #if HWY_ARCH_ARM_V7
1839 
1840 template <size_t N>
1841 HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
1842  const Vec128<int64_t, N> b) {
1843  const Simd<int32_t, N * 2, 0> d32;
1844  const Simd<int64_t, N, 0> d64;
1845  const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
1846  const auto cmp64 = cmp32 & Shuffle2301(cmp32);
1847  return MaskFromVec(BitCast(d64, cmp64));
1848 }
1849 
1850 template <size_t N>
1851 HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
1852  const Vec128<uint64_t, N> b) {
1853  const Simd<uint32_t, N * 2, 0> d32;
1854  const Simd<uint64_t, N, 0> d64;
1855  const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
1856  const auto cmp64 = cmp32 & Shuffle2301(cmp32);
1857  return MaskFromVec(BitCast(d64, cmp64));
1858 }
1859 
1860 HWY_API Mask128<int64_t> operator<(const Vec128<int64_t> a,
1861  const Vec128<int64_t> b) {
1862  const int64x2_t sub = vqsubq_s64(a.raw, b.raw);
1863  return MaskFromVec(BroadcastSignBit(Vec128<int64_t>(sub)));
1864 }
1865 HWY_API Mask128<int64_t, 1> operator<(const Vec64<int64_t> a,
1866  const Vec64<int64_t> b) {
1867  const int64x1_t sub = vqsub_s64(a.raw, b.raw);
1868  return MaskFromVec(BroadcastSignBit(Vec64<int64_t>(sub)));
1869 }
1870 
1871 template <size_t N>
1872 HWY_API Mask128<uint64_t, N> operator<(const Vec128<uint64_t, N> a,
1873  const Vec128<uint64_t, N> b) {
1874  const DFromV<decltype(a)> du;
1875  const RebindToSigned<decltype(du)> di;
1876  const Vec128<uint64_t, N> msb = AndNot(a, b) | AndNot(a ^ b, a - b);
1877  return MaskFromVec(BitCast(du, BroadcastSignBit(BitCast(di, msb))));
1878 }
1879 
1880 #endif
1881 
1882 // ------------------------------ Reversed comparisons
1883 
1884 template <typename T, size_t N>
1886  return operator<(b, a);
1887 }
1888 template <typename T, size_t N>
1890  return operator<=(b, a);
1891 }
1892 
1893 // ------------------------------ FirstN (Iota, Lt)
1894 
1895 template <typename T, size_t N>
1896 HWY_API Mask128<T, N> FirstN(const Simd<T, N, 0> d, size_t num) {
1897  const RebindToSigned<decltype(d)> di; // Signed comparisons are cheaper.
1898  return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
1899 }
1900 
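// Usage sketch (illustrative): FirstN builds a mask for a loop remainder, e.g.
//   const Full128<float> d;                  // 4 lanes
//   const Mask128<float> m = FirstN(d, 3);   // true, true, true, false
// Combined with IfThenElse, this handles the final partial vector of an array
// whose length is not a multiple of the lane count.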
1901 // ------------------------------ TestBit (Eq)
1902 
1903 #define HWY_NEON_BUILD_TPL_HWY_TESTBIT
1904 #define HWY_NEON_BUILD_RET_HWY_TESTBIT(type, size) Mask128<type##_t, size>
1905 #define HWY_NEON_BUILD_PARAM_HWY_TESTBIT(type, size) \
1906  Vec128<type##_t, size> v, Vec128<type##_t, size> bit
1907 #define HWY_NEON_BUILD_ARG_HWY_TESTBIT v.raw, bit.raw
1908 
1909 #if HWY_ARCH_ARM_A64
1910 HWY_NEON_DEF_FUNCTION_INTS_UINTS(TestBit, vtst, _, HWY_TESTBIT)
1911 #else
1912 // No 64-bit versions on armv7
1913 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(TestBit, vtst, _, HWY_TESTBIT)
1914 HWY_NEON_DEF_FUNCTION_INT_8_16_32(TestBit, vtst, _, HWY_TESTBIT)
1915 
1916 template <size_t N>
1918  Vec128<uint64_t, N> bit) {
1919  return (v & bit) == bit;
1920 }
1921 template <size_t N>
1923  Vec128<int64_t, N> bit) {
1924  return (v & bit) == bit;
1925 }
1926 
1927 #endif
1928 #undef HWY_NEON_BUILD_TPL_HWY_TESTBIT
1929 #undef HWY_NEON_BUILD_RET_HWY_TESTBIT
1930 #undef HWY_NEON_BUILD_PARAM_HWY_TESTBIT
1931 #undef HWY_NEON_BUILD_ARG_HWY_TESTBIT
1932 
1933 // ------------------------------ Abs i64 (IfThenElse, BroadcastSignBit)
1935 #if HWY_ARCH_ARM_A64
1936  return Vec128<int64_t>(vabsq_s64(v.raw));
1937 #else
1938  const auto zero = Zero(Full128<int64_t>());
1939  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
1940 #endif
1941 }
1942 HWY_API Vec64<int64_t> Abs(const Vec64<int64_t> v) {
1943 #if HWY_ARCH_ARM_A64
1944  return Vec64<int64_t>(vabs_s64(v.raw));
1945 #else
1946  const auto zero = Zero(Full64<int64_t>());
1947  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
1948 #endif
1949 }
1950 
1951 // ------------------------------ Min (IfThenElse, BroadcastSignBit)
1952 
1953 // Unsigned
1954 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Min, vmin, _, 2)
1955 
1956 template <size_t N>
1957 HWY_API Vec128<uint64_t, N> Min(const Vec128<uint64_t, N> a,
1958  const Vec128<uint64_t, N> b) {
1959 #if HWY_ARCH_ARM_A64
1960  return IfThenElse(b < a, b, a);
1961 #else
1962  const DFromV<decltype(a)> du;
1963  const RebindToSigned<decltype(du)> di;
1964  return BitCast(du, BitCast(di, a) - BitCast(di, detail::SaturatedSub(a, b)));
1965 #endif
1966 }
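
// Illustrative note (not part of the original header): the ARMv7 fallback
// uses unsigned saturating subtraction. SaturatedSub(a, b) is (a - b) when
// a >= b and 0 otherwise, so a - SaturatedSub(a, b) yields b when a >= b and
// a otherwise, i.e. the unsigned minimum. A hypothetical scalar equivalent:
static inline uint64_t ScalarMinU64Sketch(uint64_t a, uint64_t b) {
  const uint64_t sat_sub = (a >= b) ? (a - b) : 0u;  // saturating a - b
  return a - sat_sub;                                // == (a < b) ? a : b
}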
1967 
1968 // Signed
1969 HWY_NEON_DEF_FUNCTION_INT_8_16_32(Min, vmin, _, 2)
1970 
1971 template <size_t N>
1972 HWY_API Vec128<int64_t, N> Min(const Vec128<int64_t, N> a,
1973  const Vec128<int64_t, N> b) {
1974 #if HWY_ARCH_ARM_A64
1975  return IfThenElse(b < a, b, a);
1976 #else
1977  const Vec128<int64_t, N> sign = detail::SaturatedSub(a, b);
1978  return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), a, b);
1979 #endif
1980 }
1981 
1982 // Float: IEEE minimumNumber on v8, otherwise NaN if any is NaN.
1983 #if HWY_ARCH_ARM_A64
1984 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vminnm, _, 2)
1985 #else
1986 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vmin, _, 2)
1987 #endif
1988 
1989 // ------------------------------ Max (IfThenElse, BroadcastSignBit)
1990 
1991 // Unsigned (no u64)
1992 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Max, vmax, _, 2)
1993 
1994 template <size_t N>
1995 HWY_API Vec128<uint64_t, N> Max(const Vec128<uint64_t, N> a,
1996  const Vec128<uint64_t, N> b) {
1997 #if HWY_ARCH_ARM_A64
1998  return IfThenElse(b < a, a, b);
1999 #else
2000  const DFromV<decltype(a)> du;
2001  const RebindToSigned<decltype(du)> di;
2002  return BitCast(du, BitCast(di, b) + BitCast(di, detail::SaturatedSub(a, b)));
2003 #endif
2004 }
2005 
2006 // Signed (no i64)
2007 HWY_NEON_DEF_FUNCTION_INT_8_16_32(Max, vmax, _, 2)
2008 
2009 template <size_t N>
2010 HWY_API Vec128<int64_t, N> Max(const Vec128<int64_t, N> a,
2011  const Vec128<int64_t, N> b) {
2012 #if HWY_ARCH_ARM_A64
2013  return IfThenElse(b < a, a, b);
2014 #else
2015  const Vec128<int64_t, N> sign = detail::SaturatedSub(a, b);
2016  return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), b, a);
2017 #endif
2018 }
2019 
2020 // Float: IEEE maximumNumber on v8, otherwise NaN if any is NaN.
2021 #if HWY_ARCH_ARM_A64
2022 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmaxnm, _, 2)
2023 #else
2024 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmax, _, 2)
2025 #endif
2026 
2027 // ================================================== MEMORY
2028 
2029 // ------------------------------ Load 128
2030 
2032  const uint8_t* HWY_RESTRICT unaligned) {
2033  return Vec128<uint8_t>(vld1q_u8(unaligned));
2034 }
2036  const uint16_t* HWY_RESTRICT unaligned) {
2037  return Vec128<uint16_t>(vld1q_u16(unaligned));
2038 }
2040  const uint32_t* HWY_RESTRICT unaligned) {
2041  return Vec128<uint32_t>(vld1q_u32(unaligned));
2042 }
2044  const uint64_t* HWY_RESTRICT unaligned) {
2045  return Vec128<uint64_t>(vld1q_u64(unaligned));
2046 }
2048  const int8_t* HWY_RESTRICT unaligned) {
2049  return Vec128<int8_t>(vld1q_s8(unaligned));
2050 }
2052  const int16_t* HWY_RESTRICT unaligned) {
2053  return Vec128<int16_t>(vld1q_s16(unaligned));
2054 }
2056  const int32_t* HWY_RESTRICT unaligned) {
2057  return Vec128<int32_t>(vld1q_s32(unaligned));
2058 }
2060  const int64_t* HWY_RESTRICT unaligned) {
2061  return Vec128<int64_t>(vld1q_s64(unaligned));
2062 }
2064  const float* HWY_RESTRICT unaligned) {
2065  return Vec128<float>(vld1q_f32(unaligned));
2066 }
2067 #if HWY_ARCH_ARM_A64
2068 HWY_API Vec128<double> LoadU(Full128<double> /* tag */,
2069  const double* HWY_RESTRICT unaligned) {
2070  return Vec128<double>(vld1q_f64(unaligned));
2071 }
2072 #endif
2073 
2074 // ------------------------------ Load 64
2075 
2077  const uint8_t* HWY_RESTRICT p) {
2078  return Vec64<uint8_t>(vld1_u8(p));
2079 }
2081  const uint16_t* HWY_RESTRICT p) {
2082  return Vec64<uint16_t>(vld1_u16(p));
2083 }
2085  const uint32_t* HWY_RESTRICT p) {
2086  return Vec64<uint32_t>(vld1_u32(p));
2087 }
2089  const uint64_t* HWY_RESTRICT p) {
2090  return Vec64<uint64_t>(vld1_u64(p));
2091 }
2093  const int8_t* HWY_RESTRICT p) {
2094  return Vec64<int8_t>(vld1_s8(p));
2095 }
2097  const int16_t* HWY_RESTRICT p) {
2098  return Vec64<int16_t>(vld1_s16(p));
2099 }
2101  const int32_t* HWY_RESTRICT p) {
2102  return Vec64<int32_t>(vld1_s32(p));
2103 }
2105  const int64_t* HWY_RESTRICT p) {
2106  return Vec64<int64_t>(vld1_s64(p));
2107 }
2109  const float* HWY_RESTRICT p) {
2110  return Vec64<float>(vld1_f32(p));
2111 }
2112 #if HWY_ARCH_ARM_A64
2113 HWY_API Vec64<double> LoadU(Full64<double> /* tag */,
2114  const double* HWY_RESTRICT p) {
2115  return Vec64<double>(vld1_f64(p));
2116 }
2117 #endif
2118 
2119 // ------------------------------ Load 32
2120 
2122  const uint8_t* HWY_RESTRICT p) {
2123  uint32x2_t a = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
2124  return Vec32<uint8_t>(vreinterpret_u8_u32(a));
2125 }
2127  const uint16_t* HWY_RESTRICT p) {
2128  uint32x2_t a = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
2129  return Vec32<uint16_t>(vreinterpret_u16_u32(a));
2130 }
2132  const uint32_t* HWY_RESTRICT p) {
2133  return Vec32<uint32_t>(vld1_dup_u32(reinterpret_cast<const uint32_t*>(p)));
2134 }
2136  const int8_t* HWY_RESTRICT p) {
2137  int32x2_t a = vld1_dup_s32(reinterpret_cast<const int32_t*>(p));
2138  return Vec32<int8_t>(vreinterpret_s8_s32(a));
2139 }
2141  const int16_t* HWY_RESTRICT p) {
2142  int32x2_t a = vld1_dup_s32(reinterpret_cast<const int32_t*>(p));
2143  return Vec32<int16_t>(vreinterpret_s16_s32(a));
2144 }
2146  const int32_t* HWY_RESTRICT p) {
2147  return Vec32<int32_t>(vld1_dup_s32(reinterpret_cast<const int32_t*>(p)));
2148 }
2150  return Vec32<float>(vld1_dup_f32(p));
2151 }
2152 
2153 // ------------------------------ Load 16
2154 
2156  const uint8_t* HWY_RESTRICT p) {
2157  uint16x4_t a = vld1_dup_u16(reinterpret_cast<const uint16_t*>(p));
2158  return Vec128<uint8_t, 2>(vreinterpret_u8_u16(a));
2159 }
2161  const uint16_t* HWY_RESTRICT p) {
2162  return Vec128<uint16_t, 1>(
2163  vld1_dup_u16(reinterpret_cast<const uint16_t*>(p)));
2164 }
2166  const int8_t* HWY_RESTRICT p) {
2167  int16x4_t a = vld1_dup_s16(reinterpret_cast<const int16_t*>(p));
2168  return Vec128<int8_t, 2>(vreinterpret_s8_s16(a));
2169 }
2171  const int16_t* HWY_RESTRICT p) {
2172  return Vec128<int16_t, 1>(vld1_dup_s16(reinterpret_cast<const int16_t*>(p)));
2173 }
2174 
2175 // ------------------------------ Load 8
2176 
2178  const uint8_t* HWY_RESTRICT p) {
2179  return Vec128<uint8_t, 1>(vld1_dup_u8(p));
2180 }
2181 
2183  const int8_t* HWY_RESTRICT p) {
2184  return Vec128<int8_t, 1>(vld1_dup_s8(p));
2185 }
2186 
2187 // [b]float16_t use the same Raw as uint16_t, so forward to that.
2188 template <size_t N>
2190  const float16_t* HWY_RESTRICT p) {
2191  const RebindToUnsigned<decltype(d)> du16;
2192  const auto pu16 = reinterpret_cast<const uint16_t*>(p);
2193  return Vec128<float16_t, N>(LoadU(du16, pu16).raw);
2194 }
2195 template <size_t N>
2197  const bfloat16_t* HWY_RESTRICT p) {
2198  const RebindToUnsigned<decltype(d)> du16;
2199  const auto pu16 = reinterpret_cast<const uint16_t*>(p);
2200  return Vec128<bfloat16_t, N>(LoadU(du16, pu16).raw);
2201 }
2202 
2203 // On ARM, Load is the same as LoadU.
2204 template <typename T, size_t N>
2205 HWY_API Vec128<T, N> Load(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
2206  return LoadU(d, p);
2207 }
2208 
2209 template <typename T, size_t N>
2210 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> d,
2211  const T* HWY_RESTRICT aligned) {
2212  return IfThenElseZero(m, Load(d, aligned));
2213 }
2214 
2215 // 128-bit SIMD => nothing to duplicate, same as an unaligned load.
2216 template <typename T, size_t N, HWY_IF_LE128(T, N)>
2217 HWY_API Vec128<T, N> LoadDup128(Simd<T, N, 0> d,
2218  const T* const HWY_RESTRICT p) {
2219  return LoadU(d, p);
2220 }
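
// Illustrative usage sketch (not part of the original header): on NEON there
// is no alignment requirement, so Load/LoadU both lower to vld1. The function
// name is hypothetical.
static inline Vec128<float> LoadPlusOneSketch(const float* HWY_RESTRICT in) {
  const Full128<float> d;              // 4 float lanes
  return LoadU(d, in) + Set(d, 1.0f);  // vld1q_f32 followed by vaddq_f32
}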
2221 
2222 // ------------------------------ Store 128
2223 
2225  uint8_t* HWY_RESTRICT unaligned) {
2226  vst1q_u8(unaligned, v.raw);
2227 }
2229  uint16_t* HWY_RESTRICT unaligned) {
2230  vst1q_u16(unaligned, v.raw);
2231 }
2233  uint32_t* HWY_RESTRICT unaligned) {
2234  vst1q_u32(unaligned, v.raw);
2235 }
2237  uint64_t* HWY_RESTRICT unaligned) {
2238  vst1q_u64(unaligned, v.raw);
2239 }
2241  int8_t* HWY_RESTRICT unaligned) {
2242  vst1q_s8(unaligned, v.raw);
2243 }
2245  int16_t* HWY_RESTRICT unaligned) {
2246  vst1q_s16(unaligned, v.raw);
2247 }
2249  int32_t* HWY_RESTRICT unaligned) {
2250  vst1q_s32(unaligned, v.raw);
2251 }
2253  int64_t* HWY_RESTRICT unaligned) {
2254  vst1q_s64(unaligned, v.raw);
2255 }
2257  float* HWY_RESTRICT unaligned) {
2258  vst1q_f32(unaligned, v.raw);
2259 }
2260 #if HWY_ARCH_ARM_A64
2261 HWY_API void StoreU(const Vec128<double> v, Full128<double> /* tag */,
2262  double* HWY_RESTRICT unaligned) {
2263  vst1q_f64(unaligned, v.raw);
2264 }
2265 #endif
2266 
2267 // ------------------------------ Store 64
2268 
2270  uint8_t* HWY_RESTRICT p) {
2271  vst1_u8(p, v.raw);
2272 }
2274  uint16_t* HWY_RESTRICT p) {
2275  vst1_u16(p, v.raw);
2276 }
2278  uint32_t* HWY_RESTRICT p) {
2279  vst1_u32(p, v.raw);
2280 }
2282  uint64_t* HWY_RESTRICT p) {
2283  vst1_u64(p, v.raw);
2284 }
2286  int8_t* HWY_RESTRICT p) {
2287  vst1_s8(p, v.raw);
2288 }
2290  int16_t* HWY_RESTRICT p) {
2291  vst1_s16(p, v.raw);
2292 }
2294  int32_t* HWY_RESTRICT p) {
2295  vst1_s32(p, v.raw);
2296 }
2298  int64_t* HWY_RESTRICT p) {
2299  vst1_s64(p, v.raw);
2300 }
2302  float* HWY_RESTRICT p) {
2303  vst1_f32(p, v.raw);
2304 }
2305 #if HWY_ARCH_ARM_A64
2306 HWY_API void StoreU(const Vec64<double> v, Full64<double> /* tag */,
2307  double* HWY_RESTRICT p) {
2308  vst1_f64(p, v.raw);
2309 }
2310 #endif
2311 
2312 // ------------------------------ Store 32
2313 
2315  uint8_t* HWY_RESTRICT p) {
2316  uint32x2_t a = vreinterpret_u32_u8(v.raw);
2317  vst1_lane_u32(reinterpret_cast<uint32_t*>(p), a, 0);
2318 }
2320  uint16_t* HWY_RESTRICT p) {
2321  uint32x2_t a = vreinterpret_u32_u16(v.raw);
2322  vst1_lane_u32(reinterpret_cast<uint32_t*>(p), a, 0);
2323 }
2325  uint32_t* HWY_RESTRICT p) {
2326  vst1_lane_u32(p, v.raw, 0);
2327 }
2329  int8_t* HWY_RESTRICT p) {
2330  int32x2_t a = vreinterpret_s32_s8(v.raw);
2331  vst1_lane_s32(reinterpret_cast<int32_t*>(p), a, 0);
2332 }
2334  int16_t* HWY_RESTRICT p) {
2335  int32x2_t a = vreinterpret_s32_s16(v.raw);
2336  vst1_lane_s32(reinterpret_cast<int32_t*>(p), a, 0);
2337 }
2339  int32_t* HWY_RESTRICT p) {
2340  vst1_lane_s32(p, v.raw, 0);
2341 }
2343  float* HWY_RESTRICT p) {
2344  vst1_lane_f32(p, v.raw, 0);
2345 }
2346 
2347 // ------------------------------ Store 16
2348 
2350  uint8_t* HWY_RESTRICT p) {
2351  uint16x4_t a = vreinterpret_u16_u8(v.raw);
2352  vst1_lane_u16(reinterpret_cast<uint16_t*>(p), a, 0);
2353 }
2355  uint16_t* HWY_RESTRICT p) {
2356  vst1_lane_u16(p, v.raw, 0);
2357 }
2359  int8_t* HWY_RESTRICT p) {
2360  int16x4_t a = vreinterpret_s16_s8(v.raw);
2361  vst1_lane_s16(reinterpret_cast<int16_t*>(p), a, 0);
2362 }
2364  int16_t* HWY_RESTRICT p) {
2365  vst1_lane_s16(p, v.raw, 0);
2366 }
2367 
2368 // ------------------------------ Store 8
2369 
2371  uint8_t* HWY_RESTRICT p) {
2372  vst1_lane_u8(p, v.raw, 0);
2373 }
2375  int8_t* HWY_RESTRICT p) {
2376  vst1_lane_s8(p, v.raw, 0);
2377 }
2378 
2379 // [b]float16_t use the same Raw as uint16_t, so forward to that.
2380 template <size_t N>
2382  float16_t* HWY_RESTRICT p) {
2383  const RebindToUnsigned<decltype(d)> du16;
2384  const auto pu16 = reinterpret_cast<uint16_t*>(p);
2385  return StoreU(Vec128<uint16_t, N>(v.raw), du16, pu16);
2386 }
2387 template <size_t N>
2389  bfloat16_t* HWY_RESTRICT p) {
2390  const RebindToUnsigned<decltype(d)> du16;
2391  const auto pu16 = reinterpret_cast<uint16_t*>(p);
2392  return StoreU(Vec128<uint16_t, N>(v.raw), du16, pu16);
2393 }
2394 
2395 // On ARM, Store is the same as StoreU.
2396 template <typename T, size_t N>
2397 HWY_API void Store(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT aligned) {
2398  StoreU(v, d, aligned);
2399 }
2400 
2401 template <typename T, size_t N>
2402 HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
2403  T* HWY_RESTRICT p) {
2404  // Treat as unsigned so that we correctly support float16.
2405  const RebindToUnsigned<decltype(d)> du;
2406  const auto blended =
2407  IfThenElse(RebindMask(du, m), BitCast(du, v), BitCast(du, LoadU(d, p)));
2408  StoreU(BitCast(d, blended), d, p);
2409 }
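
// Illustrative usage sketch (not part of the original header): FirstN,
// MaskedLoad and BlendedStore together handle the final, partially filled
// vector of a loop. Note that this emulation still loads and stores a full
// vector, so the buffers are assumed to be padded to a whole vector. Names
// are hypothetical.
template <typename T>
void CopyWithTailSketch(const T* HWY_RESTRICT from, T* HWY_RESTRICT to,
                        size_t count) {
  const Full128<T> d;
  const size_t N = Lanes(d);
  size_t i = 0;
  for (; i + N <= count; i += N) {
    StoreU(LoadU(d, from + i), d, to + i);  // whole vectors
  }
  if (i != count) {
    const auto m = FirstN(d, count - i);  // lanes [0, count - i) are active
    BlendedStore(MaskedLoad(m, d, from + i), m, d, to + i);
  }
}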
2410 
2411 // ------------------------------ Non-temporal stores
2412 
2413 // Same as aligned stores on non-x86.
2414 
2415 template <typename T, size_t N>
2416 HWY_API void Stream(const Vec128<T, N> v, Simd<T, N, 0> d,
2417  T* HWY_RESTRICT aligned) {
2418  Store(v, d, aligned);
2419 }
2420 
2421 // ================================================== CONVERT
2422 
2423 // ------------------------------ Promotions (part w/ narrow lanes -> full)
2424 
2425 // Unsigned: zero-extend to full vector.
2427  const Vec64<uint8_t> v) {
2428  return Vec128<uint16_t>(vmovl_u8(v.raw));
2429 }
2431  const Vec32<uint8_t> v) {
2432  uint16x8_t a = vmovl_u8(v.raw);
2433  return Vec128<uint32_t>(vmovl_u16(vget_low_u16(a)));
2434 }
2436  const Vec64<uint16_t> v) {
2437  return Vec128<uint32_t>(vmovl_u16(v.raw));
2438 }
2440  const Vec64<uint32_t> v) {
2441  return Vec128<uint64_t>(vmovl_u32(v.raw));
2442 }
2444  return BitCast(d, Vec128<uint16_t>(vmovl_u8(v.raw)));
2445 }
2447  uint16x8_t a = vmovl_u8(v.raw);
2448  return BitCast(d, Vec128<uint32_t>(vmovl_u16(vget_low_u16(a))));
2449 }
2451  return BitCast(d, Vec128<uint32_t>(vmovl_u16(v.raw)));
2452 }
2453 
2454 // Unsigned: zero-extend to half vector.
2455 template <size_t N, HWY_IF_LE64(uint16_t, N)>
2457  const Vec128<uint8_t, N> v) {
2458  return Vec128<uint16_t, N>(vget_low_u16(vmovl_u8(v.raw)));
2459 }
2460 template <size_t N, HWY_IF_LE64(uint32_t, N)>
2462  const Vec128<uint8_t, N> v) {
2463  uint16x8_t a = vmovl_u8(v.raw);
2464  return Vec128<uint32_t, N>(vget_low_u32(vmovl_u16(vget_low_u16(a))));
2465 }
2466 template <size_t N>
2468  const Vec128<uint16_t, N> v) {
2469  return Vec128<uint32_t, N>(vget_low_u32(vmovl_u16(v.raw)));
2470 }
2471 template <size_t N, HWY_IF_LE64(uint64_t, N)>
2473  const Vec128<uint32_t, N> v) {
2474  return Vec128<uint64_t, N>(vget_low_u64(vmovl_u32(v.raw)));
2475 }
2476 template <size_t N, HWY_IF_LE64(int16_t, N)>
2478  const Vec128<uint8_t, N> v) {
2479  return BitCast(d, Vec128<uint16_t, N>(vget_low_u16(vmovl_u8(v.raw))));
2480 }
2481 template <size_t N, HWY_IF_LE64(int32_t, N)>
2483  const Vec128<uint8_t, N> v) {
2484  uint16x8_t a = vmovl_u8(v.raw);
2485  uint32x4_t b = vmovl_u16(vget_low_u16(a));
2486  return Vec128<int32_t, N>(vget_low_s32(vreinterpretq_s32_u32(b)));
2487 }
2488 template <size_t N, HWY_IF_LE64(int32_t, N)>
2490  const Vec128<uint16_t, N> v) {
2491  uint32x4_t a = vmovl_u16(v.raw);
2492  return Vec128<int32_t, N>(vget_low_s32(vreinterpretq_s32_u32(a)));
2493 }
2494 
2495 // Signed: replicate sign bit to full vector.
2497  const Vec64<int8_t> v) {
2498  return Vec128<int16_t>(vmovl_s8(v.raw));
2499 }
2501  const Vec32<int8_t> v) {
2502  int16x8_t a = vmovl_s8(v.raw);
2503  return Vec128<int32_t>(vmovl_s16(vget_low_s16(a)));
2504 }
2506  const Vec64<int16_t> v) {
2507  return Vec128<int32_t>(vmovl_s16(v.raw));
2508 }
2510  const Vec64<int32_t> v) {
2511  return Vec128<int64_t>(vmovl_s32(v.raw));
2512 }
2513 
2514 // Signed: replicate sign bit to half vector.
2515 template <size_t N>
2517  const Vec128<int8_t, N> v) {
2518  return Vec128<int16_t, N>(vget_low_s16(vmovl_s8(v.raw)));
2519 }
2520 template <size_t N>
2522  const Vec128<int8_t, N> v) {
2523  int16x8_t a = vmovl_s8(v.raw);
2524  int32x4_t b = vmovl_s16(vget_low_s16(a));
2525  return Vec128<int32_t, N>(vget_low_s32(b));
2526 }
2527 template <size_t N>
2529  const Vec128<int16_t, N> v) {
2530  return Vec128<int32_t, N>(vget_low_s32(vmovl_s16(v.raw)));
2531 }
2532 template <size_t N>
2534  const Vec128<int32_t, N> v) {
2535  return Vec128<int64_t, N>(vget_low_s64(vmovl_s32(v.raw)));
2536 }
2537 
2538 #if __ARM_FP & 2
2539 
2540 HWY_API Vec128<float> PromoteTo(Full128<float> /* tag */,
2541  const Vec128<float16_t, 4> v) {
2542  const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw));
2543  return Vec128<float>(f32);
2544 }
2545 template <size_t N>
2546 HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> /* tag */,
2547  const Vec128<float16_t, N> v) {
2548  const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw));
2549  return Vec128<float, N>(vget_low_f32(f32));
2550 }
2551 
2552 #else
2553 
2554 template <size_t N>
2556  const Vec128<float16_t, N> v) {
2557  const RebindToSigned<decltype(df32)> di32;
2558  const RebindToUnsigned<decltype(df32)> du32;
2559  // Expand to u32 so we can shift.
2560  const auto bits16 = PromoteTo(du32, Vec128<uint16_t, N>{v.raw});
2561  const auto sign = ShiftRight<15>(bits16);
2562  const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
2563  const auto mantissa = bits16 & Set(du32, 0x3FF);
2564  const auto subnormal =
2565  BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
2566  Set(df32, 1.0f / 16384 / 1024));
2567 
2568  const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
2569  const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
2570  const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
2571  const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
2572  return BitCast(df32, ShiftLeft<31>(sign) | bits32);
2573 }
2574 
2575 #endif
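
// Illustrative note (not part of the original header): the fallback above
// decodes IEEE binary16 manually: 1 sign bit, 5 exponent bits (bias 15) and
// 10 mantissa bits. Normal values are rebiased to exponent + (127 - 15) and
// the mantissa shifted up by 23 - 10 bits; subnormals are scaled by
// 2^-24 = 1 / (16384 * 1024). Like the vector path, this sketch does not
// special-case Inf/NaN inputs. A hypothetical scalar equivalent:
static inline float ScalarF16ToF32Sketch(uint16_t bits16) {
  const uint32_t sign = bits16 >> 15;
  const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
  const uint32_t mantissa = bits16 & 0x3FF;
  float result;
  if (biased_exp == 0) {  // zero or subnormal
    result = static_cast<float>(mantissa) * (1.0f / 16384 / 1024);
  } else {  // normal
    const uint32_t bits32 =
        ((biased_exp + 127 - 15) << 23) | (mantissa << (23 - 10));
    CopyBytes<4>(&bits32, &result);  // bit-cast via hwy/base.h helper
  }
  return sign ? -result : result;
}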
2576 
2577 #if HWY_ARCH_ARM_A64
2578 
2579 HWY_API Vec128<double> PromoteTo(Full128<double> /* tag */,
2580  const Vec64<float> v) {
2581  return Vec128<double>(vcvt_f64_f32(v.raw));
2582 }
2583 
2584 HWY_API Vec64<double> PromoteTo(Full64<double> /* tag */,
2585  const Vec32<float> v) {
2586  return Vec64<double>(vget_low_f64(vcvt_f64_f32(v.raw)));
2587 }
2588 
2589 HWY_API Vec128<double> PromoteTo(Full128<double> /* tag */,
2590  const Vec64<int32_t> v) {
2591  const int64x2_t i64 = vmovl_s32(v.raw);
2592  return Vec128<double>(vcvtq_f64_s64(i64));
2593 }
2594 
2595 HWY_API Vec64<double> PromoteTo(Full64<double> /* tag */,
2596  const Vec32<int32_t> v) {
2597  const int64x1_t i64 = vget_low_s64(vmovl_s32(v.raw));
2598  return Vec64<double>(vcvt_f64_s64(i64));
2599 }
2600 
2601 #endif
2602 
2603 // ------------------------------ Demotions (full -> part w/ narrow lanes)
2604 
2605 // From full vector to half or quarter
2607  const Vec128<int32_t> v) {
2608  return Vec64<uint16_t>(vqmovun_s32(v.raw));
2609 }
2611  const Vec128<int32_t> v) {
2612  return Vec64<int16_t>(vqmovn_s32(v.raw));
2613 }
2615  const Vec128<int32_t> v) {
2616  const uint16x4_t a = vqmovun_s32(v.raw);
2617  return Vec32<uint8_t>(vqmovn_u16(vcombine_u16(a, a)));
2618 }
2620  const Vec128<int16_t> v) {
2621  return Vec64<uint8_t>(vqmovun_s16(v.raw));
2622 }
2624  const Vec128<int32_t> v) {
2625  const int16x4_t a = vqmovn_s32(v.raw);
2626  return Vec32<int8_t>(vqmovn_s16(vcombine_s16(a, a)));
2627 }
2629  const Vec128<int16_t> v) {
2630  return Vec64<int8_t>(vqmovn_s16(v.raw));
2631 }
2632 
2633 // From half vector to partial half
2634 template <size_t N, HWY_IF_LE64(int32_t, N)>
2636  const Vec128<int32_t, N> v) {
2637  return Vec128<uint16_t, N>(vqmovun_s32(vcombine_s32(v.raw, v.raw)));
2638 }
2639 template <size_t N, HWY_IF_LE64(int32_t, N)>
2641  const Vec128<int32_t, N> v) {
2642  return Vec128<int16_t, N>(vqmovn_s32(vcombine_s32(v.raw, v.raw)));
2643 }
2644 template <size_t N, HWY_IF_LE64(int32_t, N)>
2646  const Vec128<int32_t, N> v) {
2647  const uint16x4_t a = vqmovun_s32(vcombine_s32(v.raw, v.raw));
2648  return Vec128<uint8_t, N>(vqmovn_u16(vcombine_u16(a, a)));
2649 }
2650 template <size_t N, HWY_IF_LE64(int16_t, N)>
2652  const Vec128<int16_t, N> v) {
2653  return Vec128<uint8_t, N>(vqmovun_s16(vcombine_s16(v.raw, v.raw)));
2654 }
2655 template <size_t N, HWY_IF_LE64(int32_t, N)>
2657  const Vec128<int32_t, N> v) {
2658  const int16x4_t a = vqmovn_s32(vcombine_s32(v.raw, v.raw));
2659  return Vec128<int8_t, N>(vqmovn_s16(vcombine_s16(a, a)));
2660 }
2661 template <size_t N, HWY_IF_LE64(int16_t, N)>
2663  const Vec128<int16_t, N> v) {
2664  return Vec128<int8_t, N>(vqmovn_s16(vcombine_s16(v.raw, v.raw)));
2665 }
2666 
2667 #if __ARM_FP & 2
2668 
2669 HWY_API Vec128<float16_t, 4> DemoteTo(Full64<float16_t> /* tag */,
2670  const Vec128<float> v) {
2671  return Vec128<float16_t, 4>{vreinterpret_u16_f16(vcvt_f16_f32(v.raw))};
2672 }
2673 template <size_t N>
2674 HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N, 0> /* tag */,
2675  const Vec128<float, N> v) {
2676  const float16x4_t f16 = vcvt_f16_f32(vcombine_f32(v.raw, v.raw));
2677  return Vec128<float16_t, N>(vreinterpret_u16_f16(f16));
2678 }
2679 
2680 #else
2681 
2682 template <size_t N>
2684  const Vec128<float, N> v) {
2685  const RebindToUnsigned<decltype(df16)> du16;
2686  const Rebind<uint32_t, decltype(du16)> du;
2687  const RebindToSigned<decltype(du)> di;
2688  const auto bits32 = BitCast(du, v);
2689  const auto sign = ShiftRight<31>(bits32);
2690  const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
2691  const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
2692 
2693  const auto k15 = Set(di, 15);
2694  const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
2695  const auto is_tiny = exp < Set(di, -24);
2696 
2697  const auto is_subnormal = exp < Set(di, -14);
2698  const auto biased_exp16 =
2699  BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
2700  const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11)
2701  const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
2702  (mantissa32 >> (Set(du, 13) + sub_exp));
2703  const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
2704  ShiftRight<13>(mantissa32)); // <1024
2705 
2706  const auto sign16 = ShiftLeft<15>(sign);
2707  const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
2708  const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
2709  return Vec128<float16_t, N>(DemoteTo(du16, bits16).raw);
2710 }
2711 
2712 #endif
2713 
2714 template <size_t N>
2716  const Vec128<float, N> v) {
2717  const Rebind<int32_t, decltype(dbf16)> di32;
2718  const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
2719  const Rebind<uint16_t, decltype(dbf16)> du16;
2720  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
2721  return BitCast(dbf16, DemoteTo(du16, bits_in_32));
2722 }
2723 
2724 #if HWY_ARCH_ARM_A64
2725 
2726 HWY_API Vec64<float> DemoteTo(Full64<float> /* tag */, const Vec128<double> v) {
2727  return Vec64<float>(vcvt_f32_f64(v.raw));
2728 }
2729 HWY_API Vec32<float> DemoteTo(Full32<float> /* tag */, const Vec64<double> v) {
2730  return Vec32<float>(vcvt_f32_f64(vcombine_f64(v.raw, v.raw)));
2731 }
2732 
2733 HWY_API Vec64<int32_t> DemoteTo(Full64<int32_t> /* tag */,
2734  const Vec128<double> v) {
2735  const int64x2_t i64 = vcvtq_s64_f64(v.raw);
2736  return Vec64<int32_t>(vqmovn_s64(i64));
2737 }
2738 HWY_API Vec32<int32_t> DemoteTo(Full32<int32_t> /* tag */,
2739  const Vec64<double> v) {
2740  const int64x1_t i64 = vcvt_s64_f64(v.raw);
2741  // There is no i64x1 -> i32x1 narrow, so expand to int64x2_t first.
2742  const int64x2_t i64x2 = vcombine_s64(i64, i64);
2743  return Vec32<int32_t>(vqmovn_s64(i64x2));
2744 }
2745 
2746 #endif
2747 
2749  const uint8x16_t org_v = detail::BitCastToByte(v).raw;
2750  const uint8x16_t w = vuzp1q_u8(org_v, org_v);
2751  return Vec32<uint8_t>(vget_low_u8(vuzp1q_u8(w, w)));
2752 }
2753 template <size_t N, HWY_IF_LE64(uint32_t, N)>
2755  const uint8x8_t org_v = detail::BitCastToByte(v).raw;
2756  const uint8x8_t w = vuzp1_u8(org_v, org_v);
2757  return Vec128<uint8_t, N>(vuzp1_u8(w, w));
2758 }
2759 
2760 // In the following DemoteTo functions, |b| is purposely undefined.
2761 // The value a needs to be extended to 128 bits so that vqmovn can be
2762 // used and |b| is undefined so that no extra overhead is introduced.
2763 HWY_DIAGNOSTICS(push)
2764 HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
2765 
2766 template <size_t N>
2767 HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N, 0> /* tag */,
2768  const Vec128<int32_t> v) {
2769  Vec128<uint16_t, N> a = DemoteTo(Simd<uint16_t, N, 0>(), v);
2770  Vec128<uint16_t, N> b;
2771  uint16x8_t c = vcombine_u16(a.raw, b.raw);
2772  return Vec128<uint8_t, N>(vqmovn_u16(c));
2773 }
2774 
2775 template <size_t N>
2776 HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N, 0> /* tag */,
2777  const Vec128<int32_t> v) {
2778  Vec128<int16_t, N> a = DemoteTo(Simd<int16_t, N, 0>(), v);
2779  Vec128<int16_t, N> b;
2780  int16x8_t c = vcombine_s16(a.raw, b.raw);
2781  return Vec128<int8_t, N>(vqmovn_s16(c));
2782 }
2783 
2784 HWY_DIAGNOSTICS(pop)
2785 
2786 // ------------------------------ Convert integer <=> floating-point
2787 
2788 HWY_API Vec128<float> ConvertTo(Full128<float> /* tag */,
2789  const Vec128<int32_t> v) {
2790  return Vec128<float>(vcvtq_f32_s32(v.raw));
2791 }
2792 template <size_t N, HWY_IF_LE64(int32_t, N)>
2793 HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
2794  const Vec128<int32_t, N> v) {
2795  return Vec128<float, N>(vcvt_f32_s32(v.raw));
2796 }
2797 
2798 // Truncates (rounds toward zero).
2799 HWY_API Vec128<int32_t> ConvertTo(Full128<int32_t> /* tag */,
2800  const Vec128<float> v) {
2801  return Vec128<int32_t>(vcvtq_s32_f32(v.raw));
2802 }
2803 template <size_t N, HWY_IF_LE64(float, N)>
2804 HWY_API Vec128<int32_t, N> ConvertTo(Simd<int32_t, N, 0> /* tag */,
2805  const Vec128<float, N> v) {
2806  return Vec128<int32_t, N>(vcvt_s32_f32(v.raw));
2807 }
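
// Illustrative note (not part of the original header): float -> int32
// ConvertTo truncates toward zero (vcvt[q]_s32_f32), so -1.7f becomes -1 and
// 1.7f becomes 1; NearestInt (further below) rounds to nearest instead. A
// hypothetical example:
static inline Vec128<int32_t> TruncateExampleSketch() {
  const Full128<float> df;
  const Full128<int32_t> di;
  return ConvertTo(di, Set(df, -1.7f));  // every lane becomes -1
}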
2808 
2809 #if HWY_ARCH_ARM_A64
2810 
2811 HWY_API Vec128<double> ConvertTo(Full128<double> /* tag */,
2812  const Vec128<int64_t> v) {
2813  return Vec128<double>(vcvtq_f64_s64(v.raw));
2814 }
2815 HWY_API Vec64<double> ConvertTo(Full64<double> /* tag */,
2816  const Vec64<int64_t> v) {
2817  return Vec64<double>(vcvt_f64_s64(v.raw));
2818 }
2819 
2820 // Truncates (rounds toward zero).
2821 HWY_API Vec128<int64_t> ConvertTo(Full128<int64_t> /* tag */,
2822  const Vec128<double> v) {
2823  return Vec128<int64_t>(vcvtq_s64_f64(v.raw));
2824 }
2825 HWY_API Vec64<int64_t> ConvertTo(Full64<int64_t> /* tag */,
2826  const Vec64<double> v) {
2827  return Vec64<int64_t>(vcvt_s64_f64(v.raw));
2828 }
2829 
2830 #endif
2831 
2832 // ------------------------------ Round (IfThenElse, mask, logical)
2833 
2834 #if HWY_ARCH_ARM_A64
2835 // Toward nearest integer
2836 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Round, vrndn, _, 1)
2837 
2838 // Toward zero, aka truncate
2839 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Trunc, vrnd, _, 1)
2840 
2841 // Toward +infinity, aka ceiling
2842 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Ceil, vrndp, _, 1)
2843 
2844 // Toward -infinity, aka floor
2845 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Floor, vrndm, _, 1)
2846 #else
2847 
2848 // ------------------------------ Trunc
2849 
2850 // ARMv7 only supports truncation to integer. We can either convert back to
2851 // float (3 floating-point and 2 logic operations) or manipulate the binary32
2852 // representation, clearing the lowest 23-exp mantissa bits. This requires 9
2853 // integer operations and 3 constants, which is likely more expensive.
2854 
2855 namespace detail {
2856 
2857 // The original value is already the desired result if NaN or the magnitude is
2858 // large (i.e. the value is already an integer).
2859 template <size_t N>
2860 HWY_INLINE Mask128<float, N> UseInt(const Vec128<float, N> v) {
2861  return Abs(v) < Set(Simd<float, N, 0>(), MantissaEnd<float>());
2862 }
2863 
2864 } // namespace detail
2865 
2866 template <size_t N>
2867 HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
2868  const DFromV<decltype(v)> df;
2869  const RebindToSigned<decltype(df)> di;
2870 
2871  const auto integer = ConvertTo(di, v); // round toward 0
2872  const auto int_f = ConvertTo(df, integer);
2873 
2874  return IfThenElse(detail::UseInt(v), int_f, v);
2875 }
2876 
2877 template <size_t N>
2878 HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
2879  const DFromV<decltype(v)> df;
2880 
2881  // ARMv7 also lacks a native NearestInt, but we can instead rely on rounding
2882  // (we assume the current mode is nearest-even) after addition with a large
2883  // value such that no mantissa bits remain. We may need a compiler flag for
2884  // precise floating-point to prevent this from being "optimized" out.
2885  const auto max = Set(df, MantissaEnd<float>());
2886  const auto large = CopySignToAbs(max, v);
2887  const auto added = large + v;
2888  const auto rounded = added - large;
2889 
2890  // Keep original if NaN or the magnitude is large (already an int).
2891  return IfThenElse(Abs(v) < max, rounded, v);
2892 }
2893 
2894 template <size_t N>
2895 HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
2896  const DFromV<decltype(v)> df;
2897  const RebindToSigned<decltype(df)> di;
2898 
2899  const auto integer = ConvertTo(di, v); // round toward 0
2900  const auto int_f = ConvertTo(df, integer);
2901 
2902  // Truncating a positive non-integer ends up smaller; if so, add 1.
2903  const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v)));
2904 
2905  return IfThenElse(detail::UseInt(v), int_f - neg1, v);
2906 }
2907 
2908 template <size_t N>
2909 HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
2910  const DFromV<decltype(v)> df;
2911  const RebindToSigned<decltype(df)> di;
2912 
2913  const auto integer = ConvertTo(di, v); // round toward 0
2914  const auto int_f = ConvertTo(df, integer);
2915 
2916  // Truncating a negative non-integer ends up larger; if so, subtract 1.
2917  const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v)));
2918 
2919  return IfThenElse(detail::UseInt(v), int_f + neg1, v);
2920 }
2921 
2922 #endif
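
// Illustrative note (not part of the original header): the ARMv7 Round above
// uses the classic "add and subtract a large power of two" trick. Adding
// 2^23 (MantissaEnd<float>(), with the sign of v) to a value with |v| < 2^23
// forces the FPU to drop all fraction bits using the current rounding mode
// (assumed nearest-even), and subtracting it again leaves the rounded value.
// A hypothetical scalar equivalent:
static inline float ScalarRoundSketch(float v) {
  const float kLarge = 8388608.0f;  // 2^23
  if (!(v > -kLarge && v < kLarge)) return v;  // NaN or already an integer
  const float large = (v < 0.0f) ? -kLarge : kLarge;
  volatile float sum = v + large;  // volatile: keep the rounding step
  return sum - large;
}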
2923 
2924 // ------------------------------ NearestInt (Round)
2925 
2926 #if HWY_ARCH_ARM_A64
2927 
2928 HWY_API Vec128<int32_t> NearestInt(const Vec128<float> v) {
2929  return Vec128<int32_t>(vcvtnq_s32_f32(v.raw));
2930 }
2931 template <size_t N, HWY_IF_LE64(float, N)>
2932 HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
2933  return Vec128<int32_t, N>(vcvtn_s32_f32(v.raw));
2934 }
2935 
2936 #else
2937 
2938 template <size_t N>
2939 HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
2940  const RebindToSigned<DFromV<decltype(v)>> di;
2941  return ConvertTo(di, Round(v));
2942 }
2943 
2944 #endif
2945 
2946 // ================================================== SWIZZLE
2947 
2948 // ------------------------------ LowerHalf
2949 
2950 // <= 64 bit: just return different type
2951 template <typename T, size_t N, HWY_IF_LE64(uint8_t, N)>
2953  return Vec128<T, N / 2>(v.raw);
2954 }
2955 
2957  return Vec64<uint8_t>(vget_low_u8(v.raw));
2958 }
2960  return Vec64<uint16_t>(vget_low_u16(v.raw));
2961 }
2963  return Vec64<uint32_t>(vget_low_u32(v.raw));
2964 }
2966  return Vec64<uint64_t>(vget_low_u64(v.raw));
2967 }
2969  return Vec64<int8_t>(vget_low_s8(v.raw));
2970 }
2972  return Vec64<int16_t>(vget_low_s16(v.raw));
2973 }
2975  return Vec64<int32_t>(vget_low_s32(v.raw));
2976 }
2978  return Vec64<int64_t>(vget_low_s64(v.raw));
2979 }
2981  return Vec64<float>(vget_low_f32(v.raw));
2982 }
2983 #if HWY_ARCH_ARM_A64
2984 HWY_API Vec64<double> LowerHalf(const Vec128<double> v) {
2985  return Vec64<double>(vget_low_f64(v.raw));
2986 }
2987 #endif
2988 
2989 template <typename T, size_t N>
2991  Vec128<T, N> v) {
2992  return LowerHalf(v);
2993 }
2994 
2995 // ------------------------------ CombineShiftRightBytes
2996 
2997 // 128-bit
2998 template <int kBytes, typename T, class V128 = Vec128<T>>
2999 HWY_API V128 CombineShiftRightBytes(Full128<T> d, V128 hi, V128 lo) {
3000  static_assert(0 < kBytes && kBytes < 16, "kBytes must be in [1, 15]");
3001  const Repartition<uint8_t, decltype(d)> d8;
3002  uint8x16_t v8 = vextq_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes);
3003  return BitCast(d, Vec128<uint8_t>(v8));
3004 }
3005 
3006 // 64-bit
3007 template <int kBytes, typename T>
3009  static_assert(0 < kBytes && kBytes < 8, "kBytes must be in [1, 7]");
3010  const Repartition<uint8_t, decltype(d)> d8;
3011  uint8x8_t v8 = vext_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes);
3012  return BitCast(d, VFromD<decltype(d8)>(v8));
3013 }
3014 
3015 // <= 32-bit defined after ShiftLeftBytes.
3016 
3017 // ------------------------------ Shift vector by constant #bytes
3018 
3019 namespace detail {
3020 
3021 // Partially specialize because kBytes = 0 and >= size are compile errors;
3022 // callers replace the latter with 0xFF for easier specialization.
3023 template <int kBytes>
3025  // Full
3026  template <class T>
3028  const Full128<T> d;
3029  return CombineShiftRightBytes<16 - kBytes>(d, v, Zero(d));
3030  }
3031 
3032  // Partial
3033  template <class T, size_t N, HWY_IF_LE64(T, N)>
3035  // Expand to 64-bit so we only use the native EXT instruction.
3036  const Full64<T> d64;
3037  const auto zero64 = Zero(d64);
3038  const decltype(zero64) v64(v.raw);
3039  return Vec128<T, N>(
3040  CombineShiftRightBytes<8 - kBytes>(d64, v64, zero64).raw);
3041  }
3042 };
3043 template <>
3044 struct ShiftLeftBytesT<0> {
3045  template <class T, size_t N>
3047  return v;
3048  }
3049 };
3050 template <>
3051 struct ShiftLeftBytesT<0xFF> {
3052  template <class T, size_t N>
3054  return Zero(Simd<T, N, 0>());
3055  }
3056 };
3057 
3058 template <int kBytes>
3060  template <class T, size_t N>
3062  const Simd<T, N, 0> d;
3063  // For < 64-bit vectors, zero undefined lanes so we shift in zeros.
3064  if (N * sizeof(T) < 8) {
3065  constexpr size_t kReg = N * sizeof(T) == 16 ? 16 : 8;
3066  const Simd<T, kReg / sizeof(T), 0> dreg;
3067  v = Vec128<T, N>(
3068  IfThenElseZero(FirstN(dreg, N), VFromD<decltype(dreg)>(v.raw)).raw);
3069  }
3070  return CombineShiftRightBytes<kBytes>(d, Zero(d), v);
3071  }
3072 };
3073 template <>
3074 struct ShiftRightBytesT<0> {
3075  template <class T, size_t N>
3077  return v;
3078  }
3079 };
3080 template <>
3081 struct ShiftRightBytesT<0xFF> {
3082  template <class T, size_t N>
3084  return Zero(Simd<T, N, 0>());
3085  }
3086 };
3087 
3088 } // namespace detail
3089 
3090 template <int kBytes, typename T, size_t N>
3091 HWY_API Vec128<T, N> ShiftLeftBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
3092  return detail::ShiftLeftBytesT < kBytes >= N * sizeof(T) ? 0xFF
3093  : kBytes > ()(v);
3094 }
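
// Illustrative note (not part of the original header): kBytes == 0 and
// kBytes >= vector size cannot be expressed with the vext-based
// CombineShiftRightBytes, so the dispatch above remaps "too large" counts to
// the sentinel 0xFF; ShiftLeftBytesT<0> returns the input unchanged and
// ShiftLeftBytesT<0xFF> returns Zero. A hypothetical example using the
// two-argument overload defined just above:
static inline Vec128<uint32_t> ShiftBytesExampleSketch(Vec128<uint32_t> v) {
  // Shift left by one whole u32 lane (4 bytes): lane i receives old lane
  // i - 1, and lane 0 is filled with zero bytes.
  return ShiftLeftBytes<4>(Full128<uint32_t>(), v);
}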
3095 
3096 template <int kBytes, typename T, size_t N>
3098  return ShiftLeftBytes<kBytes>(Simd<T, N, 0>(), v);
3099 }
3100 
3101 template <int kLanes, typename T, size_t N>
3103  const Repartition<uint8_t, decltype(d)> d8;
3104  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
3105 }
3106 
3107 template <int kLanes, typename T, size_t N>
3109  return ShiftLeftLanes<kLanes>(Simd<T, N, 0>(), v);
3110 }
3111 
3112 // 0x01..0F, kBytes = 1 => 0x0001..0E
3113 template <int kBytes, typename T, size_t N>
3114 HWY_API Vec128<T, N> ShiftRightBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
3115  return detail::ShiftRightBytesT < kBytes >= N * sizeof(T) ? 0xFF
3116  : kBytes > ()(v);
3117 }
3118 
3119 template <int kLanes, typename T, size_t N>
3121  const Repartition<uint8_t, decltype(d)> d8;
3122  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
3123 }
3124 
3125 // Calls ShiftLeftBytes
3126 template <int kBytes, typename T, size_t N, HWY_IF_LE32(T, N)>
3128  Vec128<T, N> lo) {
3129  constexpr size_t kSize = N * sizeof(T);
3130  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
3131  const Repartition<uint8_t, decltype(d)> d8;
3132  const Full64<uint8_t> d_full8;
3133  const Repartition<T, decltype(d_full8)> d_full;
3134  using V64 = VFromD<decltype(d_full8)>;
3135  const V64 hi64(BitCast(d8, hi).raw);
3136  // Move into most-significant bytes
3137  const V64 lo64 = ShiftLeftBytes<8 - kSize>(V64(BitCast(d8, lo).raw));
3138  const V64 r = CombineShiftRightBytes<8 - kSize + kBytes>(d_full8, hi64, lo64);
3139  // After casting to full 64-bit vector of correct type, shrink to 32-bit
3140  return Vec128<T, N>(BitCast(d_full, r).raw);
3141 }
3142 
3143 // ------------------------------ UpperHalf (ShiftRightBytes)
3144 
3145 // Full input
3147  const Vec128<uint8_t> v) {
3148  return Vec64<uint8_t>(vget_high_u8(v.raw));
3149 }
3151  const Vec128<uint16_t> v) {
3152  return Vec64<uint16_t>(vget_high_u16(v.raw));
3153 }
3155  const Vec128<uint32_t> v) {
3156  return Vec64<uint32_t>(vget_high_u32(v.raw));
3157 }
3159  const Vec128<uint64_t> v) {
3160  return Vec64<uint64_t>(vget_high_u64(v.raw));
3161 }
3163  const Vec128<int8_t> v) {
3164  return Vec64<int8_t>(vget_high_s8(v.raw));
3165 }
3167  const Vec128<int16_t> v) {
3168  return Vec64<int16_t>(vget_high_s16(v.raw));
3169 }
3171  const Vec128<int32_t> v) {
3172  return Vec64<int32_t>(vget_high_s32(v.raw));
3173 }
3175  const Vec128<int64_t> v) {
3176  return Vec64<int64_t>(vget_high_s64(v.raw));
3177 }
3179  return Vec64<float>(vget_high_f32(v.raw));
3180 }
3181 #if HWY_ARCH_ARM_A64
3182 HWY_API Vec64<double> UpperHalf(Full64<double> /* tag */,
3183  const Vec128<double> v) {
3184  return Vec64<double>(vget_high_f64(v.raw));
3185 }
3186 #endif
3187 
3188 // Partial
3189 template <typename T, size_t N, HWY_IF_LE64(T, N)>
3190 HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Half<Simd<T, N, 0>> /* tag */,
3191  Vec128<T, N> v) {
3192  const DFromV<decltype(v)> d;
3193  const RebindToUnsigned<decltype(d)> du;
3194  const auto vu = BitCast(du, v);
3195  const auto upper = BitCast(d, ShiftRightBytes<N * sizeof(T) / 2>(du, vu));
3196  return Vec128<T, (N + 1) / 2>(upper.raw);
3197 }
3198 
3199 // ------------------------------ Broadcast/splat any lane
3200 
3201 #if HWY_ARCH_ARM_A64
3202 // Unsigned
3203 template <int kLane>
3204 HWY_API Vec128<uint16_t> Broadcast(const Vec128<uint16_t> v) {
3205  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
3206  return Vec128<uint16_t>(vdupq_laneq_u16(v.raw, kLane));
3207 }
3208 template <int kLane, size_t N, HWY_IF_LE64(uint16_t, N)>
3209 HWY_API Vec128<uint16_t, N> Broadcast(const Vec128<uint16_t, N> v) {
3210  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3211  return Vec128<uint16_t, N>(vdup_lane_u16(v.raw, kLane));
3212 }
3213 template <int kLane>
3214 HWY_API Vec128<uint32_t> Broadcast(const Vec128<uint32_t> v) {
3215  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
3216  return Vec128<uint32_t>(vdupq_laneq_u32(v.raw, kLane));
3217 }
3218 template <int kLane, size_t N, HWY_IF_LE64(uint32_t, N)>
3219 HWY_API Vec128<uint32_t, N> Broadcast(const Vec128<uint32_t, N> v) {
3220  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3221  return Vec128<uint32_t, N>(vdup_lane_u32(v.raw, kLane));
3222 }
3223 template <int kLane>
3224 HWY_API Vec128<uint64_t> Broadcast(const Vec128<uint64_t> v) {
3225  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
3226  return Vec128<uint64_t>(vdupq_laneq_u64(v.raw, kLane));
3227 }
3228 // Vec64<uint64_t> is defined below.
3229 
3230 // Signed
3231 template <int kLane>
3232 HWY_API Vec128<int16_t> Broadcast(const Vec128<int16_t> v) {
3233  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
3234  return Vec128<int16_t>(vdupq_laneq_s16(v.raw, kLane));
3235 }
3236 template <int kLane, size_t N, HWY_IF_LE64(int16_t, N)>
3237 HWY_API Vec128<int16_t, N> Broadcast(const Vec128<int16_t, N> v) {
3238  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3239  return Vec128<int16_t, N>(vdup_lane_s16(v.raw, kLane));
3240 }
3241 template <int kLane>
3242 HWY_API Vec128<int32_t> Broadcast(const Vec128<int32_t> v) {
3243  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
3244  return Vec128<int32_t>(vdupq_laneq_s32(v.raw, kLane));
3245 }
3246 template <int kLane, size_t N, HWY_IF_LE64(int32_t, N)>
3247 HWY_API Vec128<int32_t, N> Broadcast(const Vec128<int32_t, N> v) {
3248  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3249  return Vec128<int32_t, N>(vdup_lane_s32(v.raw, kLane));
3250 }
3251 template <int kLane>
3252 HWY_API Vec128<int64_t> Broadcast(const Vec128<int64_t> v) {
3253  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
3254  return Vec128<int64_t>(vdupq_laneq_s64(v.raw, kLane));
3255 }
3256 // Vec64<int64_t> is defined below.
3257 
3258 // Float
3259 template <int kLane>
3260 HWY_API Vec128<float> Broadcast(const Vec128<float> v) {
3261  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
3262  return Vec128<float>(vdupq_laneq_f32(v.raw, kLane));
3263 }
3264 template <int kLane, size_t N, HWY_IF_LE64(float, N)>
3265 HWY_API Vec128<float, N> Broadcast(const Vec128<float, N> v) {
3266  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3267  return Vec128<float, N>(vdup_lane_f32(v.raw, kLane));
3268 }
3269 template <int kLane>
3270 HWY_API Vec128<double> Broadcast(const Vec128<double> v) {
3271  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
3272  return Vec128<double>(vdupq_laneq_f64(v.raw, kLane));
3273 }
3274 template <int kLane>
3275 HWY_API Vec64<double> Broadcast(const Vec64<double> v) {
3276  static_assert(0 <= kLane && kLane < 1, "Invalid lane");
3277  return v;
3278 }
3279 
3280 #else
3281 // No vdupq_laneq_* on armv7: use vgetq_lane_* + vdupq_n_*.
3282 
3283 // Unsigned
3284 template <int kLane>
3286  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
3287  return Vec128<uint16_t>(vdupq_n_u16(vgetq_lane_u16(v.raw, kLane)));
3288 }
3289 template <int kLane, size_t N, HWY_IF_LE64(uint16_t, N)>
3291  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3292  return Vec128<uint16_t, N>(vdup_lane_u16(v.raw, kLane));
3293 }
3294 template <int kLane>
3296  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
3297  return Vec128<uint32_t>(vdupq_n_u32(vgetq_lane_u32(v.raw, kLane)));
3298 }
3299 template <int kLane, size_t N, HWY_IF_LE64(uint32_t, N)>
3301  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3302  return Vec128<uint32_t, N>(vdup_lane_u32(v.raw, kLane));
3303 }
3304 template <int kLane>
3306  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
3307  return Vec128<uint64_t>(vdupq_n_u64(vgetq_lane_u64(v.raw, kLane)));
3308 }
3309 // Vec64<uint64_t> is defined below.
3310 
3311 // Signed
3312 template <int kLane>
3314  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
3315  return Vec128<int16_t>(vdupq_n_s16(vgetq_lane_s16(v.raw, kLane)));
3316 }
3317 template <int kLane, size_t N, HWY_IF_LE64(int16_t, N)>
3319  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3320  return Vec128<int16_t, N>(vdup_lane_s16(v.raw, kLane));
3321 }
3322 template <int kLane>
3324  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
3325  return Vec128<int32_t>(vdupq_n_s32(vgetq_lane_s32(v.raw, kLane)));
3326 }
3327 template <int kLane, size_t N, HWY_IF_LE64(int32_t, N)>
3329  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3330  return Vec128<int32_t, N>(vdup_lane_s32(v.raw, kLane));
3331 }
3332 template <int kLane>
3334  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
3335  return Vec128<int64_t>(vdupq_n_s64(vgetq_lane_s64(v.raw, kLane)));
3336 }
3337 // Vec64<int64_t> is defined below.
3338 
3339 // Float
3340 template <int kLane>
3342  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
3343  return Vec128<float>(vdupq_n_f32(vgetq_lane_f32(v.raw, kLane)));
3344 }
3345 template <int kLane, size_t N, HWY_IF_LE64(float, N)>
3347  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3348  return Vec128<float, N>(vdup_lane_f32(v.raw, kLane));
3349 }
3350 
3351 #endif
3352 
3353 template <int kLane>
3355  static_assert(0 <= kLane && kLane < 1, "Invalid lane");
3356  return v;
3357 }
3358 template <int kLane>
3360  static_assert(0 <= kLane && kLane < 1, "Invalid lane");
3361  return v;
3362 }
3363 
3364 // ------------------------------ TableLookupLanes
3365 
3366 // Returned by SetTableIndices for use by TableLookupLanes.
3367 template <typename T, size_t N>
3368 struct Indices128 {
3369  typename detail::Raw128<T, N>::type raw;
3370 };
3371 
3372 template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
3373 HWY_API Indices128<T, N> IndicesFromVec(Simd<T, N, 0> d, Vec128<TI, N> vec) {
3374  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
3375 #if HWY_IS_DEBUG_BUILD
3376  const Rebind<TI, decltype(d)> di;
3377  HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
3378  AllTrue(di, Lt(vec, Set(di, static_cast<TI>(N)))));
3379 #endif
3380 
3381  const Repartition<uint8_t, decltype(d)> d8;
3382  using V8 = VFromD<decltype(d8)>;
3383  const Repartition<uint16_t, decltype(d)> d16;
3384 
3385  // Broadcast each lane index to all bytes of T and shift to bytes
3386  static_assert(sizeof(T) == 4 || sizeof(T) == 8, "");
3387  if (sizeof(T) == 4) {
3388  alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
3389  0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
3390  const V8 lane_indices =
3391  TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes));
3392  const V8 byte_indices =
3393  BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices)));
3394  alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
3395  0, 1, 2, 3, 0, 1, 2, 3};
3396  const V8 sum = Add(byte_indices, Load(d8, kByteOffsets));
3397  return Indices128<T, N>{BitCast(d, sum).raw};
3398  } else {
3399  alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
3400  0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
3401  const V8 lane_indices =
3402  TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes));
3403  const V8 byte_indices =
3404  BitCast(d8, ShiftLeft<3>(BitCast(d16, lane_indices)));
3405  alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 4, 5, 6, 7,
3406  0, 1, 2, 3, 4, 5, 6, 7};
3407  const V8 sum = Add(byte_indices, Load(d8, kByteOffsets));
3408  return Indices128<T, N>{BitCast(d, sum).raw};
3409  }
3410 }
3411 
3412 template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
3413 HWY_API Indices128<T, N> SetTableIndices(Simd<T, N, 0> d, const TI* idx) {
3414  const Rebind<TI, decltype(d)> di;
3415  return IndicesFromVec(d, LoadU(di, idx));
3416 }
3417 
3418 template <typename T, size_t N>
3419 HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
3420  const DFromV<decltype(v)> d;
3421  const RebindToSigned<decltype(d)> di;
3422  return BitCast(
3423  d, TableLookupBytes(BitCast(di, v), BitCast(di, Vec128<T, N>{idx.raw})));
3424 }
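
// Illustrative usage sketch (not part of the original header): a lane
// permutation is built once via SetTableIndices (or IndicesFromVec) and then
// applied with TableLookupLanes, which is backed by TableLookupBytes here.
// Names are hypothetical; this reverses four u32 lanes:
static inline Vec128<uint32_t> ReverseViaTableSketch(Vec128<uint32_t> v) {
  const Full128<uint32_t> d;
  alignas(16) constexpr int32_t kRev[4] = {3, 2, 1, 0};  // same lane size as T
  return TableLookupLanes(v, SetTableIndices(d, kRev));
}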
3425 
3426 // ------------------------------ Reverse (Shuffle0123, Shuffle2301, Shuffle01)
3427 
3428 // Single lane: no change
3429 template <typename T>
3431  return v;
3432 }
3433 
3434 // Two lanes: shuffle
3435 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3437  return Vec128<T, 2>(Shuffle2301(v));
3438 }
3439 
3440 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3442  return Shuffle01(v);
3443 }
3444 
3445 // Four lanes: shuffle
3446 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3447 HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
3448  return Shuffle0123(v);
3449 }
3450 
3451 // 16-bit
3452 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
3454  const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
3455  return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v))));
3456 }
3457 
3458 // ------------------------------ Reverse2
3459 
3460 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_LE64(T, N)>
3462  const RebindToUnsigned<decltype(d)> du;
3463  return BitCast(d, Vec128<uint16_t, N>(vrev32_u16(BitCast(du, v).raw)));
3464 }
3465 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3467  const RebindToUnsigned<decltype(d)> du;
3468  return BitCast(d, Vec128<uint16_t>(vrev32q_u16(BitCast(du, v).raw)));
3469 }
3470 
3471 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4), HWY_IF_LE64(T, N)>
3472 HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> d, const Vec128<T, N> v) {
3473  const RebindToUnsigned<decltype(d)> du;
3474  return BitCast(d, Vec128<uint32_t, N>(vrev64_u32(BitCast(du, v).raw)));
3475 }
3476 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3477 HWY_API Vec128<T> Reverse2(Full128<T> d, const Vec128<T> v) {
3478  const RebindToUnsigned<decltype(d)> du;
3479  return BitCast(d, Vec128<uint32_t>(vrev64q_u32(BitCast(du, v).raw)));
3480 }
3481 
3482 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
3484  return Shuffle01(v);
3485 }
3486 
3487 // ------------------------------ Reverse4
3488 
3489 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_LE64(T, N)>
3491  const RebindToUnsigned<decltype(d)> du;
3492  return BitCast(d, Vec128<uint16_t, N>(vrev64_u16(BitCast(du, v).raw)));
3493 }
3494 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3496  const RebindToUnsigned<decltype(d)> du;
3497  return BitCast(d, Vec128<uint16_t>(vrev64q_u16(BitCast(du, v).raw)));
3498 }
3499 
3500 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
3502  return Shuffle0123(v);
3503 }
3504 
3505 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
3506 HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, const Vec128<T, N>) {
3507  HWY_ASSERT(0); // don't have 8 u64 lanes
3508 }
3509 
3510 // ------------------------------ Reverse8
3511 
3512 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
3514  return Reverse(d, v);
3515 }
3516 
3517 template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
3518 HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0>, const Vec128<T, N>) {
3519  HWY_ASSERT(0); // don't have 8 lanes unless 16-bit
3520 }
3521 
3522 // ------------------------------ Other shuffles (TableLookupBytes)
3523 
3524 // Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
3525 // Shuffle0321 rotates one lane to the right (the previous least-significant
3526 // lane is now most-significant). These could also be implemented via
3527 // CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
3528 
3529 // Swap 64-bit halves
3530 template <typename T>
3532  return CombineShiftRightBytes<8>(Full128<T>(), v, v);
3533 }
3534 template <typename T>
3536  return CombineShiftRightBytes<8>(Full128<T>(), v, v);
3537 }
3538 
3539 // Rotate right 32 bits
3540 template <typename T>
3542  return CombineShiftRightBytes<4>(Full128<T>(), v, v);
3543 }
3544 
3545 // Rotate left 32 bits
3546 template <typename T>
3548  return CombineShiftRightBytes<12>(Full128<T>(), v, v);
3549 }
3550 
3551 // Reverse
3552 template <typename T>
3554  return Shuffle2301(Shuffle1032(v));
3555 }
3556 
3557 // ------------------------------ InterleaveLower
3558 
3559 // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
3560 // the least-significant lane) and "b". To concatenate two half-width integers
3561 // into one, use ZipLower/Upper instead (also works with scalar).
3564 
3565 #if HWY_ARCH_ARM_A64
3566 // N=1 makes no sense (in that case, there would be no upper/lower).
3567 HWY_API Vec128<uint64_t> InterleaveLower(const Vec128<uint64_t> a,
3568  const Vec128<uint64_t> b) {
3569  return Vec128<uint64_t>(vzip1q_u64(a.raw, b.raw));
3570 }
3571 HWY_API Vec128<int64_t> InterleaveLower(const Vec128<int64_t> a,
3572  const Vec128<int64_t> b) {
3573  return Vec128<int64_t>(vzip1q_s64(a.raw, b.raw));
3574 }
3575 HWY_API Vec128<double> InterleaveLower(const Vec128<double> a,
3576  const Vec128<double> b) {
3577  return Vec128<double>(vzip1q_f64(a.raw, b.raw));
3578 }
3579 #else
3580 // ARMv7 emulation.
3582  const Vec128<uint64_t> b) {
3583  return CombineShiftRightBytes<8>(Full128<uint64_t>(), b, Shuffle01(a));
3584 }
3586  const Vec128<int64_t> b) {
3587  return CombineShiftRightBytes<8>(Full128<int64_t>(), b, Shuffle01(a));
3588 }
3589 #endif
3590 
3591 // Floats
3593  const Vec128<float> b) {
3594  return Vec128<float>(vzip1q_f32(a.raw, b.raw));
3595 }
3596 template <size_t N, HWY_IF_LE64(float, N)>
3598  const Vec128<float, N> b) {
3599  return Vec128<float, N>(vzip1_f32(a.raw, b.raw));
3600 }
3601 
3602 // < 64 bit parts
3603 template <typename T, size_t N, HWY_IF_LE32(T, N)>
3605  return Vec128<T, N>(InterleaveLower(Vec64<T>(a.raw), Vec64<T>(b.raw)).raw);
3606 }
3607 
3608 // Additional overload for the optional Simd<> tag.
3609 template <typename T, size_t N, class V = Vec128<T, N>>
3610 HWY_API V InterleaveLower(Simd<T, N, 0> /* tag */, V a, V b) {
3611  return InterleaveLower(a, b);
3612 }
3613 
3614 // ------------------------------ InterleaveUpper (UpperHalf)
3615 
3616 // All functions inside detail lack the required D parameter.
3617 namespace detail {
3620 
3621 #if HWY_ARCH_ARM_A64
3622 // N=1 makes no sense (in that case, there would be no upper/lower).
3623 HWY_API Vec128<uint64_t> InterleaveUpper(const Vec128<uint64_t> a,
3624  const Vec128<uint64_t> b) {
3625  return Vec128<uint64_t>(vzip2q_u64(a.raw, b.raw));
3626 }
3627 HWY_API Vec128<int64_t> InterleaveUpper(Vec128<int64_t> a, Vec128<int64_t> b) {
3628  return Vec128<int64_t>(vzip2q_s64(a.raw, b.raw));
3629 }
3630 HWY_API Vec128<double> InterleaveUpper(Vec128<double> a, Vec128<double> b) {
3631  return Vec128<double>(vzip2q_f64(a.raw, b.raw));
3632 }
3633 #else
3634 // ARMv7 emulation.
3636  const Vec128<uint64_t> b) {
3637  return CombineShiftRightBytes<8>(Full128<uint64_t>(), Shuffle01(b), a);
3638 }
3640  return CombineShiftRightBytes<8>(Full128<int64_t>(), Shuffle01(b), a);
3641 }
3642 #endif
3643 
3645  return Vec128<float>(vzip2q_f32(a.raw, b.raw));
3646 }
3648  const Vec64<float> b) {
3649  return Vec64<float>(vzip2_f32(a.raw, b.raw));
3650 }
3651 
3652 } // namespace detail
3653 
3654 // Full register
3655 template <typename T, size_t N, HWY_IF_GE64(T, N), class V = Vec128<T, N>>
3656 HWY_API V InterleaveUpper(Simd<T, N, 0> /* tag */, V a, V b) {
3657  return detail::InterleaveUpper(a, b);
3658 }
3659 
3660 // Partial
3661 template <typename T, size_t N, HWY_IF_LE32(T, N), class V = Vec128<T, N>>
3662 HWY_API V InterleaveUpper(Simd<T, N, 0> d, V a, V b) {
3663  const Half<decltype(d)> d2;
3664  return InterleaveLower(d, V(UpperHalf(d2, a).raw), V(UpperHalf(d2, b).raw));
3665 }
3666 
3667 // ------------------------------ ZipLower/ZipUpper (InterleaveLower)
3668 
3669 // Same as Interleave*, except that the return lanes are double-width integers;
3670 // this is necessary because the single-lane scalar cannot return two values.
3671 template <class V, class DW = RepartitionToWide<DFromV<V>>>
3672 HWY_API VFromD<DW> ZipLower(V a, V b) {
3673  return BitCast(DW(), InterleaveLower(a, b));
3674 }
3675 template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
3676 HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
3677  return BitCast(dw, InterleaveLower(D(), a, b));
3678 }
3679 
3680 template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
3681 HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
3682  return BitCast(dw, InterleaveUpper(D(), a, b));
3683 }
3684 
3685 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
3686 
3687 template <size_t N>
3688 HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
3689  Vec128<bfloat16_t, 2 * N> a,
3690  Vec128<bfloat16_t, 2 * N> b,
3691  const Vec128<float, N> sum0,
3692  Vec128<float, N>& sum1) {
3693  const Repartition<uint16_t, decltype(df32)> du16;
3694  const RebindToUnsigned<decltype(df32)> du32;
3695  const Vec128<uint16_t, 2 * N> zero = Zero(du16);
3696  const Vec128<uint32_t, N> a0 = ZipLower(du32, zero, BitCast(du16, a));
3697  const Vec128<uint32_t, N> a1 = ZipUpper(du32, zero, BitCast(du16, a));
3698  const Vec128<uint32_t, N> b0 = ZipLower(du32, zero, BitCast(du16, b));
3699  const Vec128<uint32_t, N> b1 = ZipUpper(du32, zero, BitCast(du16, b));
3700  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
3701  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
3702 }
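
// Illustrative usage sketch (not part of the original header): the intended
// pattern accumulates into two float vectors and only combines them at the
// end, because the lane order of the reordered products is unspecified.
// Names are hypothetical:
static inline Vec128<float> DotBF16Sketch(Vec128<bfloat16_t> a,
                                          Vec128<bfloat16_t> b) {
  const Full128<float> df32;
  Vec128<float> sum1 = Zero(df32);
  const Vec128<float> sum0 =
      ReorderWidenMulAccumulate(df32, a, b, Zero(df32), sum1);
  return sum0 + sum1;  // callers typically reduce this further across lanes
}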
3703 
3704 // ================================================== COMBINE
3705 
3706 // ------------------------------ Combine (InterleaveLower)
3707 
3708 // Full result
3710  Vec64<uint8_t> lo) {
3711  return Vec128<uint8_t>(vcombine_u8(lo.raw, hi.raw));
3712 }
3715  return Vec128<uint16_t>(vcombine_u16(lo.raw, hi.raw));
3716 }
3719  return Vec128<uint32_t>(vcombine_u32(lo.raw, hi.raw));
3720 }
3723  return Vec128<uint64_t>(vcombine_u64(lo.raw, hi.raw));
3724 }
3725 
3727  Vec64<int8_t> lo) {
3728  return Vec128<int8_t>(vcombine_s8(lo.raw, hi.raw));
3729 }
3731  Vec64<int16_t> lo) {
3732  return Vec128<int16_t>(vcombine_s16(lo.raw, hi.raw));
3733 }
3735  Vec64<int32_t> lo) {
3736  return Vec128<int32_t>(vcombine_s32(lo.raw, hi.raw));
3737 }
3739  Vec64<int64_t> lo) {
3740  return Vec128<int64_t>(vcombine_s64(lo.raw, hi.raw));
3741 }
3742 
3744  Vec64<float> lo) {
3745  return Vec128<float>(vcombine_f32(lo.raw, hi.raw));
3746 }
3747 #if HWY_ARCH_ARM_A64
3748 HWY_API Vec128<double> Combine(Full128<double> /* tag */, Vec64<double> hi,
3749  Vec64<double> lo) {
3750  return Vec128<double>(vcombine_f64(lo.raw, hi.raw));
3751 }
3752 #endif
3753 
3754 // < 64bit input, <= 64 bit result
3755 template <typename T, size_t N, HWY_IF_LE64(T, N)>
3756 HWY_API Vec128<T, N> Combine(Simd<T, N, 0> d, Vec128<T, N / 2> hi,
3757  Vec128<T, N / 2> lo) {
3758  // First double N (only lower halves will be used).
3759  const Vec128<T, N> hi2(hi.raw);
3760  const Vec128<T, N> lo2(lo.raw);
3761  // Repartition to two unsigned lanes (each the size of the valid input).
3762  const Simd<UnsignedFromSize<N * sizeof(T) / 2>, 2, 0> du;
3763  return BitCast(d, InterleaveLower(BitCast(du, lo2), BitCast(du, hi2)));
3764 }
3765 
3766 // ------------------------------ ZeroExtendVector (Combine)
3767 
3768 template <typename T, size_t N>
3769 HWY_API Vec128<T, N> ZeroExtendVector(Simd<T, N, 0> d, Vec128<T, N / 2> lo) {
3770  return Combine(d, Zero(Half<decltype(d)>()), lo);
3771 }
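
// Illustrative usage sketch (not part of the original header): Combine
// concatenates two half vectors, with `hi` filling the upper half;
// ZeroExtendVector is the special case with a zero upper half. The function
// name is hypothetical.
static inline Vec128<uint8_t> CombineExampleSketch(Vec64<uint8_t> hi,
                                                   Vec64<uint8_t> lo) {
  const Full128<uint8_t> d;
  return Combine(d, hi, lo);  // lanes [0, 8) = lo, lanes [8, 16) = hi
}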
3772 
3773 // ------------------------------ ConcatLowerLower
3774 
3775 // 64 or 128-bit input: just interleave
3776 template <typename T, size_t N, HWY_IF_GE64(T, N)>
3778  Vec128<T, N> lo) {
3779  // Treat half-width input as a single lane and interleave them.
3780  const Repartition<UnsignedFromSize<N * sizeof(T) / 2>, decltype(d)> du;
3781  return BitCast(d, InterleaveLower(BitCast(du, lo), BitCast(du, hi)));
3782 }
3783 
3784 namespace detail {
3785 #if HWY_ARCH_ARM_A64
3786 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(InterleaveEven, vtrn1, _, 2)
3787 HWY_NEON_DEF_FUNCTION_INT_8_16_32(InterleaveEven, vtrn1, _, 2)
3788 HWY_NEON_DEF_FUNCTION_FLOAT_32(InterleaveEven, vtrn1, _, 2)
3789 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(InterleaveOdd, vtrn2, _, 2)
3790 HWY_NEON_DEF_FUNCTION_INT_8_16_32(InterleaveOdd, vtrn2, _, 2)
3791 HWY_NEON_DEF_FUNCTION_FLOAT_32(InterleaveOdd, vtrn2, _, 2)
3792 #else
3793 
3794 // vtrn returns a struct with even and odd result.
3795 #define HWY_NEON_BUILD_TPL_HWY_TRN
3796 #define HWY_NEON_BUILD_RET_HWY_TRN(type, size) type##x##size##x2_t
3797 // Pass raw args so we can accept uint16x2 args, for which there is no
3798 // corresponding uint16x2x2 return type.
3799 #define HWY_NEON_BUILD_PARAM_HWY_TRN(TYPE, size) \
3800  Raw128<TYPE##_t, size>::type a, Raw128<TYPE##_t, size>::type b
3801 #define HWY_NEON_BUILD_ARG_HWY_TRN a, b
3802 
3803 // Cannot use UINT8 etc. type macros because the x2_t tuples are only defined
3804 // for full and half vectors.
3805 HWY_NEON_DEF_FUNCTION(uint8, 16, InterleaveEvenOdd, vtrnq, _, u8, HWY_TRN)
3806 HWY_NEON_DEF_FUNCTION(uint8, 8, InterleaveEvenOdd, vtrn, _, u8, HWY_TRN)
3807 HWY_NEON_DEF_FUNCTION(uint16, 8, InterleaveEvenOdd, vtrnq, _, u16, HWY_TRN)
3808 HWY_NEON_DEF_FUNCTION(uint16, 4, InterleaveEvenOdd, vtrn, _, u16, HWY_TRN)
3809 HWY_NEON_DEF_FUNCTION(uint32, 4, InterleaveEvenOdd, vtrnq, _, u32, HWY_TRN)
3810 HWY_NEON_DEF_FUNCTION(uint32, 2, InterleaveEvenOdd, vtrn, _, u32, HWY_TRN)
3811 HWY_NEON_DEF_FUNCTION(int8, 16, InterleaveEvenOdd, vtrnq, _, s8, HWY_TRN)
3812 HWY_NEON_DEF_FUNCTION(int8, 8, InterleaveEvenOdd, vtrn, _, s8, HWY_TRN)
3813 HWY_NEON_DEF_FUNCTION(int16, 8, InterleaveEvenOdd, vtrnq, _, s16, HWY_TRN)
3814 HWY_NEON_DEF_FUNCTION(int16, 4, InterleaveEvenOdd, vtrn, _, s16, HWY_TRN)
3815 HWY_NEON_DEF_FUNCTION(int32, 4, InterleaveEvenOdd, vtrnq, _, s32, HWY_TRN)
3816 HWY_NEON_DEF_FUNCTION(int32, 2, InterleaveEvenOdd, vtrn, _, s32, HWY_TRN)
3817 HWY_NEON_DEF_FUNCTION(float32, 4, InterleaveEvenOdd, vtrnq, _, f32, HWY_TRN)
3818 HWY_NEON_DEF_FUNCTION(float32, 2, InterleaveEvenOdd, vtrn, _, f32, HWY_TRN)
3819 #endif
3820 } // namespace detail
3821 
3822 // <= 32-bit input/output
3823 template <typename T, size_t N, HWY_IF_LE32(T, N)>
3824 HWY_API Vec128<T, N> ConcatLowerLower(const Simd<T, N, 0> d, Vec128<T, N> hi,
3825  Vec128<T, N> lo) {
3826  // Treat half-width input as two lanes and take every second one.
3827  const Repartition<UnsignedFromSize<N * sizeof(T) / 2>, decltype(d)> du;
3828 #if HWY_ARCH_ARM_A64
3829  return BitCast(d, detail::InterleaveEven(BitCast(du, lo), BitCast(du, hi)));
3830 #else
3831  using VU = VFromD<decltype(du)>;
3832  return BitCast(
3833  d, VU(detail::InterleaveEvenOdd(BitCast(du, lo).raw, BitCast(du, hi).raw)
3834  .val[0]));
3835 #endif
3836 }
3837 
3838 // ------------------------------ ConcatUpperUpper
3839 
3840 // 64 or 128-bit input: just interleave
3841 template <typename T, size_t N, HWY_IF_GE64(T, N)>
3842 HWY_API Vec128<T, N> ConcatUpperUpper(const Simd<T, N, 0> d, Vec128<T, N> hi,
3843  Vec128<T, N> lo) {
3844  // Treat half-width input as a single lane and interleave them.
3845  const Repartition<UnsignedFromSize<N * sizeof(T) / 2>, decltype(d)> du;
3846  return BitCast(d, InterleaveUpper(du, BitCast(du, lo), BitCast(du, hi)));
3847 }
3848 
3849 // <= 32-bit input/output
3850 template <typename T, size_t N, HWY_IF_LE32(T, N)>
3851 HWY_API Vec128<T, N> ConcatUpperUpper(const Simd<T, N, 0> d, Vec128<T, N> hi,
3852  Vec128<T, N> lo) {
3853  // Treat half-width input as two lanes and take every second one.
3854  const Repartition<UnsignedFromSize<N * sizeof(T) / 2>, decltype(d)> du;
3855 #if HWY_ARCH_ARM_A64
3856  return BitCast(d, detail::InterleaveOdd(BitCast(du, lo), BitCast(du, hi)));
3857 #else
3858  using VU = VFromD<decltype(du)>;
3859  return BitCast(
3860  d, VU(detail::InterleaveEvenOdd(BitCast(du, lo).raw, BitCast(du, hi).raw)
3861  .val[1]));
3862 #endif
3863 }
3864 
3865 // ------------------------------ ConcatLowerUpper (ShiftLeftBytes)
3866 
3867 // 64 or 128-bit input: extract from concatenated
3868 template <typename T, size_t N, HWY_IF_GE64(T, N)>
3869 HWY_API Vec128<T, N> ConcatLowerUpper(const Simd<T, N, 0> d, Vec128<T, N> hi,
3870  Vec128<T, N> lo) {
3871  return CombineShiftRightBytes<N * sizeof(T) / 2>(d, hi, lo);
3872 }
3873 
3874 // <= 32-bit input/output
3875 template <typename T, size_t N, HWY_IF_LE32(T, N)>
3876 HWY_API Vec128<T, N> ConcatLowerUpper(const Simd<T, N, 0> d, Vec128<T, N> hi,
3877  Vec128<T, N> lo) {
3878  constexpr size_t kSize = N * sizeof(T);
3879  const Repartition<uint8_t, decltype(d)> d8;
3880  const Full64<uint8_t> d8x8;
3881  const Full64<T> d64;
3882  using V8x8 = VFromD<decltype(d8x8)>;
3883  const V8x8 hi8x8(BitCast(d8, hi).raw);
3884  // Move into most-significant bytes
3885  const V8x8 lo8x8 = ShiftLeftBytes<8 - kSize>(V8x8(BitCast(d8, lo).raw));
3886  const V8x8 r = CombineShiftRightBytes<8 - kSize / 2>(d8x8, hi8x8, lo8x8);
3887  // Back to original lane type, then shrink N.
3888  return Vec128<T, N>(BitCast(d64, r).raw);
3889 }
3890 
3891 // ------------------------------ ConcatUpperLower
3892 
3893 // Works for all N.
3894 template <typename T, size_t N>
3895 HWY_API Vec128<T, N> ConcatUpperLower(const Simd<T, N, 0> d, Vec128<T, N> hi,
3896  Vec128<T, N> lo) {
3897  return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi);
3898 }
3899 
3900 // ------------------------------ ConcatOdd (InterleaveUpper)
3901 
3902 // 32-bit full
3903 HWY_API Vec128<uint32_t> ConcatOdd(Full128<uint32_t> /* tag */,
3904  Vec128<uint32_t> hi, Vec128<uint32_t> lo) {
3905  return Vec128<uint32_t>(vuzp2q_u32(lo.raw, hi.raw));
3906 }
3907 HWY_API Vec128<int32_t> ConcatOdd(Full128<int32_t> /* tag */,
3908  Vec128<int32_t> hi, Vec128<int32_t> lo) {
3909  return Vec128<int32_t>(vuzp2q_s32(lo.raw, hi.raw));
3910 }
3911 HWY_API Vec128<float> ConcatOdd(Full128<float> /* tag */, Vec128<float> hi,
3912  Vec128<float> lo) {
3913  return Vec128<float>(vuzp2q_f32(lo.raw, hi.raw));
3914 }
3915 
3916 // 32-bit partial
3917 template <size_t N, HWY_IF_LE64(uint32_t, N)>
3918 HWY_API Vec128<uint32_t, N> ConcatOdd(Simd<uint32_t, N, 0> /* tag */,
3919  Vec128<uint32_t, N> hi,
3920  Vec128<uint32_t, N> lo) {
3921  return Vec128<uint32_t, N>(vuzp2_u32(lo.raw, hi.raw));
3922 }
3923 template <size_t N, HWY_IF_LE64(int32_t, N)>
3924 HWY_API Vec128<int32_t, N> ConcatOdd(Simd<int32_t, N, 0> /* tag */,
3925  Vec128<int32_t, N> hi,
3926  Vec128<int32_t, N> lo) {
3927  return Vec128<int32_t, N>(vuzp2_s32(lo.raw, hi.raw));
3928 }
3929 template <size_t N, HWY_IF_LE64(float, N)>
3930 HWY_API Vec128<float, N> ConcatOdd(Simd<float, N, 0> /* tag */,
3931  Vec128<float, N> hi, Vec128<float, N> lo) {
3932  return Vec128<float, N>(vuzp2_f32(lo.raw, hi.raw));
3933 }
3934 
3935 // 64-bit full - no partial because we need at least two inputs to have
3936 // even/odd. ARMv7 lacks vuzpq_u64, and it's anyway the same as InterleaveUpper.
3937 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3938 HWY_API Vec128<T> ConcatOdd(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
3939  return InterleaveUpper(d, lo, hi);
3940 }
3941 
3942 // ------------------------------ ConcatEven (InterleaveLower)
3943 
3944 // 32-bit full
3945 HWY_API Vec128<uint32_t> ConcatEven(Full128<uint32_t> /* tag */,
3946  Vec128<uint32_t> hi, Vec128<uint32_t> lo) {
3947  return Vec128<uint32_t>(vuzp1q_u32(lo.raw, hi.raw));
3948 }
3949 HWY_API Vec128<int32_t> ConcatEven(Full128<int32_t> /* tag */,
3950  Vec128<int32_t> hi, Vec128<int32_t> lo) {
3951  return Vec128<int32_t>(vuzp1q_s32(lo.raw, hi.raw));
3952 }
3953 HWY_API Vec128<float> ConcatEven(Full128<float> /* tag */, Vec128<float> hi,
3954  Vec128<float> lo) {
3955  return Vec128<float>(vuzp1q_f32(lo.raw, hi.raw));
3956 }
3957 
3958 // 32-bit partial
3959 template <size_t N, HWY_IF_LE64(uint32_t, N)>
3960 HWY_API Vec128<uint32_t, N> ConcatEven(Simd<uint32_t, N, 0> /* tag */,
3961  Vec128<uint32_t, N> hi,
3962  Vec128<uint32_t, N> lo) {
3963  return Vec128<uint32_t, N>(vuzp1_u32(lo.raw, hi.raw));
3964 }
3965 template <size_t N, HWY_IF_LE64(int32_t, N)>
3966 HWY_API Vec128<int32_t, N> ConcatEven(Simd<int32_t, N, 0> /* tag */,
3967  Vec128<int32_t, N> hi,
3968  Vec128<int32_t, N> lo) {
3969  return Vec128<int32_t, N>(vuzp1_s32(lo.raw, hi.raw));
3970 }
3971 template <size_t N, HWY_IF_LE64(float, N)>
3972 HWY_API Vec128<float, N> ConcatEven(Simd<float, N, 0> /* tag */,
3973  Vec128<float, N> hi, Vec128<float, N> lo) {
3974  return Vec128<float, N>(vuzp1_f32(lo.raw, hi.raw));
3975 }
3976 
3977 // 64-bit full - no partial because we need at least two inputs to have
3978 // even/odd. ARMv7 lacks vuzpq_u64, and it's anyway the same as InterleaveLower.
3979 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3980 HWY_API Vec128<T> ConcatEven(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
3981  return InterleaveLower(d, lo, hi);
3982 }
3983 
3984 // ------------------------------ DupEven (InterleaveLower)
3985 
3986 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
3987 HWY_API Vec128<T, N> DupEven(const Vec128<T, N> v) {
3988 #if HWY_ARCH_ARM_A64
3989  return detail::InterleaveEven(v, v);
3990 #else
3991  return Vec128<T, N>(detail::InterleaveEvenOdd(v.raw, v.raw).val[0]);
3992 #endif
3993 }
3994 
3995 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
3996 HWY_API Vec128<T, N> DupEven(const Vec128<T, N> v) {
3997  return InterleaveLower(Simd<T, N, 0>(), v, v);
3998 }
3999 
4000 // ------------------------------ DupOdd (InterleaveUpper)
4001 
4002 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4003 HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) {
4004 #if HWY_ARCH_ARM_A64
4005  return detail::InterleaveOdd(v, v);
4006 #else
4007  return Vec128<T, N>(detail::InterleaveEvenOdd(v.raw, v.raw).val[1]);
4008 #endif
4009 }
4010 
4011 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4012 HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) {
4013  return InterleaveUpper(Simd<T, N, 0>(), v, v);
4014 }
4015 
4016 // ------------------------------ OddEven (IfThenElse)
4017 
4018 template <typename T, size_t N>
4019 HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
4020  const Simd<T, N, 0> d;
4021  const Repartition<uint8_t, decltype(d)> d8;
4022  alignas(16) constexpr uint8_t kBytes[16] = {
4023  ((0 / sizeof(T)) & 1) ? 0 : 0xFF, ((1 / sizeof(T)) & 1) ? 0 : 0xFF,
4024  ((2 / sizeof(T)) & 1) ? 0 : 0xFF, ((3 / sizeof(T)) & 1) ? 0 : 0xFF,
4025  ((4 / sizeof(T)) & 1) ? 0 : 0xFF, ((5 / sizeof(T)) & 1) ? 0 : 0xFF,
4026  ((6 / sizeof(T)) & 1) ? 0 : 0xFF, ((7 / sizeof(T)) & 1) ? 0 : 0xFF,
4027  ((8 / sizeof(T)) & 1) ? 0 : 0xFF, ((9 / sizeof(T)) & 1) ? 0 : 0xFF,
4028  ((10 / sizeof(T)) & 1) ? 0 : 0xFF, ((11 / sizeof(T)) & 1) ? 0 : 0xFF,
4029  ((12 / sizeof(T)) & 1) ? 0 : 0xFF, ((13 / sizeof(T)) & 1) ? 0 : 0xFF,
4030  ((14 / sizeof(T)) & 1) ? 0 : 0xFF, ((15 / sizeof(T)) & 1) ? 0 : 0xFF,
4031  };
4032  const auto vec = BitCast(d, Load(d8, kBytes));
4033  return IfThenElse(MaskFromVec(vec), b, a);
4034 }
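A worked sketch of the selection above with hypothetical lane values (T = uint32_t, so every other group of four mask bytes is 0xFF): even lanes come from b, odd lanes from a.

  const Full128<uint32_t> d;
  const auto a = Iota(d, 1);     // 1 2 3 4
  const auto b = Iota(d, 5);     // 5 6 7 8
  const auto r = OddEven(a, b);  // 5 2 7 4: even lanes from b, odd lanes from a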
4035 
4036 // ------------------------------ OddEvenBlocks
4037 template <typename T, size_t N>
4038 HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
4039  return even;
4040 }
4041 
4042 // ------------------------------ SwapAdjacentBlocks
4043 
4044 template <typename T, size_t N>
4045 HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
4046  return v;
4047 }
4048 
4049 // ------------------------------ ReverseBlocks
4050 
4051 // Single block: no change
4052 template <typename T>
4053 HWY_API Vec128<T> ReverseBlocks(Full128<T> /* tag */, const Vec128<T> v) {
4054  return v;
4055 }
4056 
4057 // ------------------------------ ReorderDemote2To (OddEven)
4058 
4059 template <size_t N>
4060 HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
4061  Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
4062  const RebindToUnsigned<decltype(dbf16)> du16;
4063  const Repartition<uint32_t, decltype(dbf16)> du32;
4064  const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
4065  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
4066 }
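A usage sketch with assumed inputs: each bf16 lane holds the upper 16 bits of an f32 input, and the lanes of a and b are interleaved via OddEven rather than concatenated.

  const Simd<float, 4, 0> df32;
  const Simd<bfloat16_t, 8, 0> dbf16;
  const auto a = Set(df32, 1.5f);
  const auto b = Set(df32, 2.5f);
  const auto packed = ReorderDemote2To(dbf16, a, b);  // 8 bf16 lanes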
4067 
4068 // ================================================== CRYPTO
4069 
4070 #if defined(__ARM_FEATURE_AES)
4071 
4072 // Per-target flag to prevent generic_ops-inl.h from defining AESRound.
4073 #ifdef HWY_NATIVE_AES
4074 #undef HWY_NATIVE_AES
4075 #else
4076 #define HWY_NATIVE_AES
4077 #endif
4078 
4079 HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state,
4080  Vec128<uint8_t> round_key) {
4081  // NOTE: it is important that AESE and AESMC be consecutive instructions so
4082  // they can be fused. AESE includes AddRoundKey, which is a different ordering
4083  // than the AES-NI semantics we adopted, so XOR by 0 and later with the actual
4084  // round key (the compiler will hopefully optimize this for multiple rounds).
4085  return Vec128<uint8_t>(vaesmcq_u8(vaeseq_u8(state.raw, vdupq_n_u8(0)))) ^
4086  round_key;
4087 }
4088 
4089 HWY_API Vec128<uint8_t> AESLastRound(Vec128<uint8_t> state,
4090  Vec128<uint8_t> round_key) {
4091  return Vec128<uint8_t>(vaeseq_u8(state.raw, vdupq_n_u8(0))) ^ round_key;
4092 }
4093 
4094 HWY_API Vec128<uint64_t> CLMulLower(Vec128<uint64_t> a, Vec128<uint64_t> b) {
4095  return Vec128<uint64_t>((uint64x2_t)vmull_p64(GetLane(a), GetLane(b)));
4096 }
4097 
4098 HWY_API Vec128<uint64_t> CLMulUpper(Vec128<uint64_t> a, Vec128<uint64_t> b) {
4099  return Vec128<uint64_t>(
4100  (uint64x2_t)vmull_high_p64((poly64x2_t)a.raw, (poly64x2_t)b.raw));
4101 }
4102 
4103 #endif // __ARM_FEATURE_AES
4104 
4105 // ================================================== MISC
4106 
4107 template <size_t N>
4108 HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
4109  const Vec128<bfloat16_t, N> v) {
4110  const Rebind<uint16_t, decltype(df32)> du16;
4111  const RebindToSigned<decltype(df32)> di32;
4112  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
4113 }
4114 
4115 // ------------------------------ MulEven (ConcatEven)
4116 
4117 // Multiplies even lanes (0, 2, ..); the lower half of each double-wide result
4118 // lands in the even lane and the upper half in its odd neighbor lane.
4119 HWY_API Vec128<int64_t> MulEven(Vec128<int32_t> a, Vec128<int32_t> b) {
4120  const Full128<int32_t> d;
4121  int32x4_t a_packed = ConcatEven(d, a, a).raw;
4122  int32x4_t b_packed = ConcatEven(d, b, b).raw;
4123  return Vec128<int64_t>(
4124  vmull_s32(vget_low_s32(a_packed), vget_low_s32(b_packed)));
4125 }
4126 HWY_API Vec128<uint64_t> MulEven(Vec128<uint32_t> a, Vec128<uint32_t> b) {
4127  const Full128<uint32_t> d;
4128  uint32x4_t a_packed = ConcatEven(d, a, a).raw;
4129  uint32x4_t b_packed = ConcatEven(d, b, b).raw;
4130  return Vec128<uint64_t>(
4131  vmull_u32(vget_low_u32(a_packed), vget_low_u32(b_packed)));
4132 }
4133 
4134 template <size_t N>
4135 HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
4136  const Vec128<int32_t, N> b) {
4137  const DFromV<decltype(a)> d;
4138  int32x2_t a_packed = ConcatEven(d, a, a).raw;
4139  int32x2_t b_packed = ConcatEven(d, b, b).raw;
4140  return Vec128<int64_t, (N + 1) / 2>(
4141  vget_low_s64(vmull_s32(a_packed, b_packed)));
4142 }
4143 template <size_t N>
4144 HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
4145  const Vec128<uint32_t, N> b) {
4146  const DFromV<decltype(a)> d;
4147  uint32x2_t a_packed = ConcatEven(d, a, a).raw;
4148  uint32x2_t b_packed = ConcatEven(d, b, b).raw;
4149  return Vec128<uint64_t, (N + 1) / 2>(
4150  vget_low_u64(vmull_u32(a_packed, b_packed)));
4151 }
4152 
4153 HWY_INLINE Vec128<uint64_t> MulEven(Vec128<uint64_t> a, Vec128<uint64_t> b) {
4154  uint64_t hi;
4155  uint64_t lo = Mul128(vgetq_lane_u64(a.raw, 0), vgetq_lane_u64(b.raw, 0), &hi);
4156  return Vec128<uint64_t>(vsetq_lane_u64(hi, vdupq_n_u64(lo), 1));
4157 }
4158 
4159 HWY_INLINE Vec128<uint64_t> MulOdd(Vec128<uint64_t> a, Vec128<uint64_t> b) {
4160  uint64_t hi;
4161  uint64_t lo = Mul128(vgetq_lane_u64(a.raw, 1), vgetq_lane_u64(b.raw, 1), &hi);
4162  return Vec128<uint64_t>(vsetq_lane_u64(hi, vdupq_n_u64(lo), 1));
4163 }
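A worked sketch with assumed inputs, showing the lane layout for the u32 overload:

  // a = {2, 9, 3, 9}, b = {10, 9, 100, 9}; only even lanes participate.
  // Result lanes (as uint64_t): {2 * 10, 3 * 100} = {20, 300}.
  const Full128<uint32_t> d32;
  alignas(16) const uint32_t a_lanes[4] = {2, 9, 3, 9};
  alignas(16) const uint32_t b_lanes[4] = {10, 9, 100, 9};
  const Vec128<uint64_t> prod = MulEven(Load(d32, a_lanes), Load(d32, b_lanes));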
4164 
4165 // ------------------------------ TableLookupBytes (Combine, LowerHalf)
4166 
4167 // Both full
4168 template <typename T, typename TI>
4169 HWY_API Vec128<TI> TableLookupBytes(const Vec128<T> bytes,
4170  const Vec128<TI> from) {
4171  const Full128<TI> d;
4172  const Repartition<uint8_t, decltype(d)> d8;
4173 #if HWY_ARCH_ARM_A64
4174  return BitCast(d, Vec128<uint8_t>(vqtbl1q_u8(BitCast(d8, bytes).raw,
4175  BitCast(d8, from).raw)));
4176 #else
4177  uint8x16_t table0 = BitCast(d8, bytes).raw;
4178  uint8x8x2_t table;
4179  table.val[0] = vget_low_u8(table0);
4180  table.val[1] = vget_high_u8(table0);
4181  uint8x16_t idx = BitCast(d8, from).raw;
4182  uint8x8_t low = vtbl2_u8(table, vget_low_u8(idx));
4183  uint8x8_t hi = vtbl2_u8(table, vget_high_u8(idx));
4184  return BitCast(d, Vec128<uint8_t>(vcombine_u8(low, hi)));
4185 #endif
4186 }
4187 
4188 // Partial index vector
4189 template <typename T, typename TI, size_t NI, HWY_IF_LE64(TI, NI)>
4190 HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T> bytes,
4191  const Vec128<TI, NI> from) {
4192  const Full128<TI> d_full;
4193  const Vec64<TI> from64(from.raw);
4194  const auto idx_full = Combine(d_full, from64, from64);
4195  const auto out_full = TableLookupBytes(bytes, idx_full);
4196  return Vec128<TI, NI>(LowerHalf(Half<decltype(d_full)>(), out_full).raw);
4197 }
4198 
4199 // Partial table vector
4200 template <typename T, size_t N, typename TI, HWY_IF_LE64(T, N)>
4201 HWY_API Vec128<TI> TableLookupBytes(const Vec128<T, N> bytes,
4202  const Vec128<TI> from) {
4203  const Full128<T> d_full;
4204  return TableLookupBytes(Combine(d_full, bytes, bytes), from);
4205 }
4206 
4207 // Partial both
4208 template <typename T, size_t N, typename TI, size_t NI, HWY_IF_LE64(T, N),
4209  HWY_IF_LE64(TI, NI)>
4210 HWY_API Vec128<TI, NI> TableLookupBytes(
4211  Vec128<T, N> bytes, Vec128<TI, NI> from) {
4212  const Simd<T, N, 0> d;
4213  const Simd<TI, NI, 0> d_idx;
4214  const Repartition<uint8_t, decltype(d_idx)> d_idx8;
4215  // uint8x8
4216  const auto bytes8 = BitCast(Repartition<uint8_t, decltype(d)>(), bytes);
4217  const auto from8 = BitCast(d_idx8, from);
4218  const VFromD<decltype(d_idx8)> v8(vtbl1_u8(bytes8.raw, from8.raw));
4219  return BitCast(d_idx, v8);
4220 }
4221 
4222 // For all vector widths; ARM anyway zeroes lanes whose index is >= 0x10.
4223 template <class V, class VI>
4224 HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) {
4225  return TableLookupBytes(bytes, from);
4226 }
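A usage sketch with an assumed index vector: each index selects one byte of `bytes`, here reversing the register.

  const Full128<uint8_t> d8;
  alignas(16) const uint8_t kRev[16] = {15, 14, 13, 12, 11, 10, 9, 8,
                                        7, 6, 5, 4, 3, 2, 1, 0};
  const auto bytes = Iota(d8, 0);                                 // 0 1 .. 15
  const auto reversed = TableLookupBytes(bytes, Load(d8, kRev));  // 15 14 .. 0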
4227 
4228 // ------------------------------ Scatter (Store)
4229 
4230 template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
4231 HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> d,
4232  T* HWY_RESTRICT base,
4233  const Vec128<Offset, N> offset) {
4234  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
4235 
4236  alignas(16) T lanes[N];
4237  Store(v, d, lanes);
4238 
4239  alignas(16) Offset offset_lanes[N];
4240  Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
4241 
4242  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
4243  for (size_t i = 0; i < N; ++i) {
4244  CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
4245  }
4246 }
4247 
4248 template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
4249 HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT base,
4250  const Vec128<Index, N> index) {
4251  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
4252 
4253  alignas(16) T lanes[N];
4254  Store(v, d, lanes);
4255 
4256  alignas(16) Index index_lanes[N];
4257  Store(index, Rebind<Index, decltype(d)>(), index_lanes);
4258 
4259  for (size_t i = 0; i < N; ++i) {
4260  base[index_lanes[i]] = lanes[i];
4261  }
4262 }
4263 
4264 // ------------------------------ Gather (Load/Store)
4265 
4266 template <typename T, size_t N, typename Offset>
4267 HWY_API Vec128<T, N> GatherOffset(const Simd<T, N, 0> d,
4268  const T* HWY_RESTRICT base,
4269  const Vec128<Offset, N> offset) {
4270  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
4271 
4272  alignas(16) Offset offset_lanes[N];
4273  Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
4274 
4275  alignas(16) T lanes[N];
4276  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
4277  for (size_t i = 0; i < N; ++i) {
4278  CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
4279  }
4280  return Load(d, lanes);
4281 }
4282 
4283 template <typename T, size_t N, typename Index>
4284 HWY_API Vec128<T, N> GatherIndex(const Simd<T, N, 0> d,
4285  const T* HWY_RESTRICT base,
4286  const Vec128<Index, N> index) {
4287  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
4288 
4289  alignas(16) Index index_lanes[N];
4290  Store(index, Rebind<Index, decltype(d)>(), index_lanes);
4291 
4292  alignas(16) T lanes[N];
4293  for (size_t i = 0; i < N; ++i) {
4294  lanes[i] = base[index_lanes[i]];
4295  }
4296  return Load(d, lanes);
4297 }
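These loops simply spill to aligned stack arrays; a round-trip sketch with assumed in-bounds indices:

  const Full128<int32_t> d;
  alignas(16) const int32_t idx_lanes[4] = {3, 0, 2, 1};
  const Vec128<int32_t> idx = Load(d, idx_lanes);
  int32_t table[4] = {0, 0, 0, 0};
  ScatterIndex(Iota(d, 100), d, table, idx);  // table = {101, 103, 102, 100}
  const Vec128<int32_t> v = GatherIndex(d, table, idx);  // 100 101 102 103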
4298 
4299 // ------------------------------ Reductions
4300 
4301 namespace detail {
4302 
4303 // N=1 for any T: no-op
4304 template <typename T>
4306  return v;
4307 }
4308 template <typename T>
4310  const Vec128<T, 1> v) {
4311  return v;
4312 }
4313 template <typename T>
4315  const Vec128<T, 1> v) {
4316  return v;
4317 }
4318 
4319 // u32/i32/f32: N=2
4320 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4322  return v10 + Shuffle2301(v10);
4323 }
4324 template <typename T>
4326  const Vec128<T, 2> v10) {
4327  return Min(v10, Shuffle2301(v10));
4328 }
4329 template <typename T>
4331  const Vec128<T, 2> v10) {
4332  return Max(v10, Shuffle2301(v10));
4333 }
4334 
4335 // full vectors
4336 #if HWY_ARCH_ARM_A64
4337 HWY_INLINE Vec128<uint32_t> SumOfLanes(const Vec128<uint32_t> v) {
4338  return Vec128<uint32_t>(vdupq_n_u32(vaddvq_u32(v.raw)));
4339 }
4340 HWY_INLINE Vec128<int32_t> SumOfLanes(const Vec128<int32_t> v) {
4341  return Vec128<int32_t>(vdupq_n_s32(vaddvq_s32(v.raw)));
4342 }
4343 HWY_INLINE Vec128<float> SumOfLanes(const Vec128<float> v) {
4344  return Vec128<float>(vdupq_n_f32(vaddvq_f32(v.raw)));
4345 }
4346 HWY_INLINE Vec128<uint64_t> SumOfLanes(const Vec128<uint64_t> v) {
4347  return Vec128<uint64_t>(vdupq_n_u64(vaddvq_u64(v.raw)));
4348 }
4349 HWY_INLINE Vec128<int64_t> SumOfLanes(const Vec128<int64_t> v) {
4350  return Vec128<int64_t>(vdupq_n_s64(vaddvq_s64(v.raw)));
4351 }
4352 HWY_INLINE Vec128<double> SumOfLanes(const Vec128<double> v) {
4353  return Vec128<double>(vdupq_n_f64(vaddvq_f64(v.raw)));
4354 }
4355 #else
4356 // ARMv7 version for everything except doubles.
4357 HWY_INLINE Vec128<uint32_t> SumOfLanes(const Vec128<uint32_t> v) {
4358  uint32x4x2_t v0 = vuzpq_u32(v.raw, v.raw);
4359  uint32x4_t c0 = vaddq_u32(v0.val[0], v0.val[1]);
4360  uint32x4x2_t v1 = vuzpq_u32(c0, c0);
4361  return Vec128<uint32_t>(vaddq_u32(v1.val[0], v1.val[1]));
4362 }
4363 HWY_INLINE Vec128<int32_t> SumOfLanes(const Vec128<int32_t> v) {
4364  int32x4x2_t v0 = vuzpq_s32(v.raw, v.raw);
4365  int32x4_t c0 = vaddq_s32(v0.val[0], v0.val[1]);
4366  int32x4x2_t v1 = vuzpq_s32(c0, c0);
4367  return Vec128<int32_t>(vaddq_s32(v1.val[0], v1.val[1]));
4368 }
4369 HWY_INLINE Vec128<float> SumOfLanes(const Vec128<float> v) {
4370  float32x4x2_t v0 = vuzpq_f32(v.raw, v.raw);
4371  float32x4_t c0 = vaddq_f32(v0.val[0], v0.val[1]);
4372  float32x4x2_t v1 = vuzpq_f32(c0, c0);
4373  return Vec128<float>(vaddq_f32(v1.val[0], v1.val[1]));
4374 }
4375 HWY_INLINE Vec128<uint64_t> SumOfLanes(const Vec128<uint64_t> v) {
4376  return v + Shuffle01(v);
4377 }
4378 HWY_INLINE Vec128<int64_t> SumOfLanes(const Vec128<int64_t> v) {
4379  return v + Shuffle01(v);
4380 }
4381 #endif
4382 
4383 template <typename T>
4385  const Vec128<T> v3210) {
4386  const Vec128<T> v1032 = Shuffle1032(v3210);
4387  const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
4388  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
4389  return Min(v20_31_20_31, v31_20_31_20);
4390 }
4391 template <typename T>
4393  const Vec128<T> v3210) {
4394  const Vec128<T> v1032 = Shuffle1032(v3210);
4395  const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
4396  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
4397  return Max(v20_31_20_31, v31_20_31_20);
4398 }
4399 
4400 // For u64/i64[/f64].
4401 template <typename T>
4403  const Vec128<T> v10) {
4404  const Vec128<T> v01 = Shuffle01(v10);
4405  return Min(v10, v01);
4406 }
4407 template <typename T>
4409  const Vec128<T> v10) {
4410  const Vec128<T> v01 = Shuffle01(v10);
4411  return Max(v10, v01);
4412 }
4413 
4414 // u16/i16
4415 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
4418  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
4419  const auto odd = ShiftRight<16>(BitCast(d32, v));
4420  const auto min = MinOfLanes(d32, Min(even, odd));
4421  // Also broadcast into odd lanes.
4422  return BitCast(Simd<T, N, 0>(), Or(min, ShiftLeft<16>(min)));
4423 }
4424 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
4427  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
4428  const auto odd = ShiftRight<16>(BitCast(d32, v));
4429  const auto max = MaxOfLanes(d32, Max(even, odd));
4430  // Also broadcast into odd lanes.
4431  return BitCast(Simd<T, N, 0>(), Or(max, ShiftLeft<16>(max)));
4432 }
4433 
4434 } // namespace detail
4435 
4436 template <typename T, size_t N>
4437 HWY_API Vec128<T, N> SumOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
4438  return detail::SumOfLanes(v);
4439 }
4440 template <typename T, size_t N>
4441 HWY_API Vec128<T, N> MinOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
4442  return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
4443 }
4444 template <typename T, size_t N>
4445 HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
4446  return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), v);
4447 }
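The reductions broadcast their result into every lane, so a scalar total can be read back with GetLane; for example (assumed input):

  const Full128<float> df;
  const auto v = Iota(df, 1.0f);                   // 1 2 3 4
  const float total = GetLane(SumOfLanes(df, v));  // 10.0f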
4448 
4449 // ------------------------------ LoadMaskBits (TestBit)
4450 
4451 namespace detail {
4452 
4453 // Helper function to set 64 bits and potentially return a smaller vector. The
4454 // overload is required to call the q vs non-q intrinsics. Note that 8-bit
4455 // LoadMaskBits only requires 16 bits, but 64 avoids casting.
4456 template <typename T, size_t N, HWY_IF_LE64(T, N)>
4457 HWY_INLINE Vec128<T, N> Set64(Simd<T, N, 0> /* tag */, uint64_t mask_bits) {
4458  const auto v64 = Vec64<uint64_t>(vdup_n_u64(mask_bits));
4459  return Vec128<T, N>(BitCast(Full64<T>(), v64).raw);
4460 }
4461 template <typename T>
4462 HWY_INLINE Vec128<T> Set64(Full128<T> d, uint64_t mask_bits) {
4463  return BitCast(d, Vec128<uint64_t>(vdupq_n_u64(mask_bits)));
4464 }
4465 
4466 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
4467 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
4468  const RebindToUnsigned<decltype(d)> du;
4469  // Easier than Set(), which would require an >8-bit type, which would not
4470  // compile for T=uint8_t, N=1.
4471  const auto vmask_bits = Set64(du, mask_bits);
4472 
4473  // Replicate bytes 8x such that each byte contains the bit that governs it.
4474  alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
4475  1, 1, 1, 1, 1, 1, 1, 1};
4476  const auto rep8 = TableLookupBytes(vmask_bits, Load(du, kRep8));
4477 
4478  alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
4479  1, 2, 4, 8, 16, 32, 64, 128};
4480  return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
4481 }
4482 
4483 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4484 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
4485  const RebindToUnsigned<decltype(d)> du;
4486  alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
4487  const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
4488  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
4489 }
4490 
4491 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4492 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
4493  const RebindToUnsigned<decltype(d)> du;
4494  alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
4495  const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
4496  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
4497 }
4498 
4499 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4500 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
4501  const RebindToUnsigned<decltype(d)> du;
4502  alignas(16) constexpr uint64_t kBit[8] = {1, 2};
4503  return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
4504 }
4505 
4506 } // namespace detail
4507 
4508 // `bits` points to at least 8 readable bytes, not all of which need be valid.
4509 template <typename T, size_t N, HWY_IF_LE128(T, N)>
4510 HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d,
4511  const uint8_t* HWY_RESTRICT bits) {
4512  uint64_t mask_bits = 0;
4513  CopyBytes<(N + 7) / 8>(bits, &mask_bits);
4514  return detail::LoadMaskBits(d, mask_bits);
4515 }
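A usage sketch (assumed packed bits; the least-significant bit corresponds to lane 0):

  const Full128<uint32_t> d;
  const uint8_t packed = 0x5;  // binary 0101: lanes 0 and 2 active
  const Mask128<uint32_t> m = LoadMaskBits(d, &packed);
  const size_t active = CountTrue(d, m);  // 2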
4516 
4517 // ------------------------------ Mask
4518 
4519 namespace detail {
4520 
4521 template <typename T>
4522 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
4523  const Mask128<T> mask) {
4524  alignas(16) constexpr uint8_t kSliceLanes[16] = {
4525  1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80,
4526  };
4527  const Full128<uint8_t> du;
4528  const Vec128<uint8_t> values =
4529  BitCast(du, VecFromMask(Full128<T>(), mask)) & Load(du, kSliceLanes);
4530 
4531 #if HWY_ARCH_ARM_A64
4532  // Can't vaddv - we need two separate bytes (16 bits).
4533  const uint8x8_t x2 = vget_low_u8(vpaddq_u8(values.raw, values.raw));
4534  const uint8x8_t x4 = vpadd_u8(x2, x2);
4535  const uint8x8_t x8 = vpadd_u8(x4, x4);
4536  return vget_lane_u64(vreinterpret_u64_u8(x8), 0);
4537 #else
4538  // Don't have vpaddq, so keep doubling lane size.
4539  const uint16x8_t x2 = vpaddlq_u8(values.raw);
4540  const uint32x4_t x4 = vpaddlq_u16(x2);
4541  const uint64x2_t x8 = vpaddlq_u32(x4);
4542  return (vgetq_lane_u64(x8, 1) << 8) | vgetq_lane_u64(x8, 0);
4543 #endif
4544 }
4545 
4546 template <typename T, size_t N, HWY_IF_LE64(T, N)>
4547 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
4548  const Mask128<T, N> mask) {
4549  // Upper lanes of partial loads are undefined. OnlyActive will fix this if
4550  // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
4551  alignas(8) constexpr uint8_t kSliceLanes[8] = {1, 2, 4, 8,
4552  0x10, 0x20, 0x40, 0x80};
4553  const Simd<T, N, 0> d;
4554  const RebindToUnsigned<decltype(d)> du;
4555  const Vec128<uint8_t, N> slice(Load(Full64<uint8_t>(), kSliceLanes).raw);
4556  const Vec128<uint8_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
4557 
4558 #if HWY_ARCH_ARM_A64
4559  return vaddv_u8(values.raw);
4560 #else
4561  const uint16x4_t x2 = vpaddl_u8(values.raw);
4562  const uint32x2_t x4 = vpaddl_u16(x2);
4563  const uint64x1_t x8 = vpaddl_u32(x4);
4564  return vget_lane_u64(x8, 0);
4565 #endif
4566 }
4567 
4568 template <typename T>
4569 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
4570  const Mask128<T> mask) {
4571  alignas(16) constexpr uint16_t kSliceLanes[8] = {1, 2, 4, 8,
4572  0x10, 0x20, 0x40, 0x80};
4573  const Full128<T> d;
4574  const Full128<uint16_t> du;
4575  const Vec128<uint16_t> values =
4576  BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
4577 #if HWY_ARCH_ARM_A64
4578  return vaddvq_u16(values.raw);
4579 #else
4580  const uint32x4_t x2 = vpaddlq_u16(values.raw);
4581  const uint64x2_t x4 = vpaddlq_u32(x2);
4582  return vgetq_lane_u64(x4, 0) + vgetq_lane_u64(x4, 1);
4583 #endif
4584 }
4585 
4586 template <typename T, size_t N, HWY_IF_LE64(T, N)>
4587 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
4588  const Mask128<T, N> mask) {
4589  // Upper lanes of partial loads are undefined. OnlyActive will fix this if
4590  // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
4591  alignas(8) constexpr uint16_t kSliceLanes[4] = {1, 2, 4, 8};
4592  const Simd<T, N, 0> d;
4593  const RebindToUnsigned<decltype(d)> du;
4594  const Vec128<uint16_t, N> slice(Load(Full64<uint16_t>(), kSliceLanes).raw);
4595  const Vec128<uint16_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
4596 #if HWY_ARCH_ARM_A64
4597  return vaddv_u16(values.raw);
4598 #else
4599  const uint32x2_t x2 = vpaddl_u16(values.raw);
4600  const uint64x1_t x4 = vpaddl_u32(x2);
4601  return vget_lane_u64(x4, 0);
4602 #endif
4603 }
4604 
4605 template <typename T>
4606 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
4607  const Mask128<T> mask) {
4608  alignas(16) constexpr uint32_t kSliceLanes[4] = {1, 2, 4, 8};
4609  const Full128<T> d;
4610  const Full128<uint32_t> du;
4611  const Vec128<uint32_t> values =
4612  BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
4613 #if HWY_ARCH_ARM_A64
4614  return vaddvq_u32(values.raw);
4615 #else
4616  const uint64x2_t x2 = vpaddlq_u32(values.raw);
4617  return vgetq_lane_u64(x2, 0) + vgetq_lane_u64(x2, 1);
4618 #endif
4619 }
4620 
4621 template <typename T, size_t N, HWY_IF_LE64(T, N)>
4622 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
4623  const Mask128<T, N> mask) {
4624  // Upper lanes of partial loads are undefined. OnlyActive will fix this if
4625  // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
4626  alignas(8) constexpr uint32_t kSliceLanes[2] = {1, 2};
4627  const Simd<T, N, 0> d;
4628  const RebindToUnsigned<decltype(d)> du;
4629  const Vec128<uint32_t, N> slice(Load(Full64<uint32_t>(), kSliceLanes).raw);
4630  const Vec128<uint32_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
4631 #if HWY_ARCH_ARM_A64
4632  return vaddv_u32(values.raw);
4633 #else
4634  const uint64x1_t x2 = vpaddl_u32(values.raw);
4635  return vget_lane_u64(x2, 0);
4636 #endif
4637 }
4638 
4639 template <typename T>
4640 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, const Mask128<T> m) {
4641  alignas(16) constexpr uint64_t kSliceLanes[2] = {1, 2};
4642  const Full128<T> d;
4643  const Full128<uint64_t> du;
4644  const Vec128<uint64_t> values =
4645  BitCast(du, VecFromMask(d, m)) & Load(du, kSliceLanes);
4646 #if HWY_ARCH_ARM_A64
4647  return vaddvq_u64(values.raw);
4648 #else
4649  return vgetq_lane_u64(values.raw, 0) + vgetq_lane_u64(values.raw, 1);
4650 #endif
4651 }
4652 
4653 template <typename T>
4654 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/,
4655  const Mask128<T, 1> m) {
4656  const Full64<T> d;
4657  const Full64<uint64_t> du;
4658  const Vec64<uint64_t> values = BitCast(du, VecFromMask(d, m)) & Set(du, 1);
4659  return vget_lane_u64(values.raw, 0);
4660 }
4661 
4662 // Keeps only the lowest N bits of the BitsFromMask result.
4663 template <typename T, size_t N>
4664 constexpr uint64_t OnlyActive(uint64_t bits) {
4665  return ((N * sizeof(T)) >= 8) ? bits : (bits & ((1ull << N) - 1));
4666 }
4667 
4668 template <typename T, size_t N>
4669 HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
4670  return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
4671 }
4672 
4673 // Returns number of lanes whose mask is set.
4674 //
4675 // Masks are either FF..FF or 0. Unfortunately there is no reduce-sub op
4676 // ("vsubv"). ANDing with 1 would work but requires a constant. Negating also
4677 // changes each lane to 1 (if mask set) or 0.
4678 
4679 template <typename T>
4680 HWY_INLINE size_t CountTrue(hwy::SizeTag<1> /*tag*/, const Mask128<T> mask) {
4681  const Full128<int8_t> di;
4682  const int8x16_t ones =
4683  vnegq_s8(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
4684 
4685 #if HWY_ARCH_ARM_A64
4686  return static_cast<size_t>(vaddvq_s8(ones));
4687 #else
4688  const int16x8_t x2 = vpaddlq_s8(ones);
4689  const int32x4_t x4 = vpaddlq_s16(x2);
4690  const int64x2_t x8 = vpaddlq_s32(x4);
4691  return static_cast<size_t>(vgetq_lane_s64(x8, 0) + vgetq_lane_s64(x8, 1));
4692 #endif
4693 }
4694 template <typename T>
4695 HWY_INLINE size_t CountTrue(hwy::SizeTag<2> /*tag*/, const Mask128<T> mask) {
4696  const Full128<int16_t> di;
4697  const int16x8_t ones =
4698  vnegq_s16(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
4699 
4700 #if HWY_ARCH_ARM_A64
4701  return static_cast<size_t>(vaddvq_s16(ones));
4702 #else
4703  const int32x4_t x2 = vpaddlq_s16(ones);
4704  const int64x2_t x4 = vpaddlq_s32(x2);
4705  return static_cast<size_t>(vgetq_lane_s64(x4, 0) + vgetq_lane_s64(x4, 1));
4706 #endif
4707 }
4708 
4709 template <typename T>
4710 HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> mask) {
4711  const Full128<int32_t> di;
4712  const int32x4_t ones =
4713  vnegq_s32(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
4714 
4715 #if HWY_ARCH_ARM_A64
4716  return static_cast<size_t>(vaddvq_s32(ones));
4717 #else
4718  const int64x2_t x2 = vpaddlq_s32(ones);
4719  return static_cast<size_t>(vgetq_lane_s64(x2, 0) + vgetq_lane_s64(x2, 1));
4720 #endif
4721 }
4722 
4723 template <typename T>
4724 HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> mask) {
4725 #if HWY_ARCH_ARM_A64
4726  const Full128<int64_t> di;
4727  const int64x2_t ones =
4728  vnegq_s64(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
4729  return static_cast<size_t>(vaddvq_s64(ones));
4730 #else
4731  const Full128<uint64_t> du;
4732  const auto mask_u = VecFromMask(du, RebindMask(du, mask));
4733  const uint64x2_t ones = vshrq_n_u64(mask_u.raw, 63);
4734  return static_cast<size_t>(vgetq_lane_u64(ones, 0) + vgetq_lane_u64(ones, 1));
4735 #endif
4736 }
4737 
4738 } // namespace detail
4739 
4740 // Full
4741 template <typename T>
4742 HWY_API size_t CountTrue(Full128<T> /* tag */, const Mask128<T> mask) {
4743  return detail::CountTrue(hwy::SizeTag<sizeof(T)>(), mask);
4744 }
4745 
4746 // Partial
4747 template <typename T, size_t N, HWY_IF_LE64(T, N)>
4748 HWY_API size_t CountTrue(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
4749  return PopCount(detail::BitsFromMask(mask));
4750 }
4751 
4752 template <typename T, size_t N>
4753 HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
4754  const Mask128<T, N> mask) {
4755  const uint64_t bits = detail::BitsFromMask(mask);
4756  return bits ? static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero64(bits)) : -1;
4757 }
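For example, using the comparison operators defined earlier in this header (assumed input values):

  const Full128<int32_t> d;
  const auto v = Iota(d, 0);                   // 0 1 2 3
  const Mask128<int32_t> m = v > Set(d, 1);    // lanes 2 and 3
  const size_t n = CountTrue(d, m);            // 2
  const intptr_t first = FindFirstTrue(d, m);  // 2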
4758 
4759 // `bits` points to at least 8 writable bytes.
4760 template <typename T, size_t N>
4761 HWY_API size_t StoreMaskBits(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask,
4762  uint8_t* bits) {
4763  const uint64_t mask_bits = detail::BitsFromMask(mask);
4764  const size_t kNumBytes = (N + 7) / 8;
4765  CopyBytes<kNumBytes>(&mask_bits, bits);
4766  return kNumBytes;
4767 }
4768 
4769 // Full
4770 template <typename T>
4771 HWY_API bool AllFalse(const Full128<T> d, const Mask128<T> m) {
4772 #if HWY_ARCH_ARM_A64
4773  const Full128<uint32_t> d32;
4774  const auto m32 = MaskFromVec(BitCast(d32, VecFromMask(d, m)));
4775  return (vmaxvq_u32(m32.raw) == 0);
4776 #else
4777  const auto v64 = BitCast(Full128<uint64_t>(), VecFromMask(d, m));
4778  uint32x2_t a = vqmovn_u64(v64.raw);
4779  return vget_lane_u64(vreinterpret_u64_u32(a), 0) == 0;
4780 #endif
4781 }
4782 
4783 // Partial
4784 template <typename T, size_t N, HWY_IF_LE64(T, N)>
4785 HWY_API bool AllFalse(const Simd<T, N, 0> /* tag */, const Mask128<T, N> m) {
4786  return detail::BitsFromMask(m) == 0;
4787 }
4788 
4789 template <typename T, size_t N>
4790 HWY_API bool AllTrue(const Simd<T, N, 0> d, const Mask128<T, N> m) {
4791  return AllFalse(d, VecFromMask(d, m) == Zero(d));
4792 }
4793 
4794 // ------------------------------ Compress
4795 
4796 template <typename T>
4797 struct CompressIsPartition {
4798  enum { value = 1 };
4799 };
4800 
4801 namespace detail {
4802 
4803 // Load 8 bytes, replicate into upper half so ZipLower can use the lower half.
4805  const uint8_t* bytes) {
4806  return Vec128<uint8_t>(vreinterpretq_u8_u64(
4807  vld1q_dup_u64(reinterpret_cast<const uint64_t*>(bytes))));
4808 }
4809 
4810 // Load 8 bytes and return half-reg with N <= 8 bytes.
4811 template <size_t N, HWY_IF_LE64(uint8_t, N)>
4813  const uint8_t* bytes) {
4814  return Load(d, bytes);
4815 }
4816 
4817 template <typename T, size_t N>
4818 HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<2> /*tag*/,
4819  const uint64_t mask_bits) {
4820  HWY_DASSERT(mask_bits < 256);
4821  const Simd<T, N, 0> d;
4822  const Repartition<uint8_t, decltype(d)> d8;
4823  const Simd<uint16_t, N, 0> du;
4824 
4825  // ARM does not provide an equivalent of AVX2 permutevar, so we need byte
4826  // indices for VTBL (one vector's worth for each of 256 combinations of
4827  // 8 mask bits). Loading them directly would require 4 KiB. We can instead
4828  // store lane indices and convert to byte indices (2*lane + 0..1), with the
4829  // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
4830  // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
4831  // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
4832  // is likely more costly than the higher cache footprint from storing bytes.
4833  alignas(16) constexpr uint8_t table[256 * 8] = {
4834  0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
4835  2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
4836  4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14, //
4837  2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
4838  6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14, //
4839  2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14, //
4840  4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14, //
4841  2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
4842  8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14, //
4843  2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14, //
4844  4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14, //
4845  2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14, //
4846  6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14, //
4847  2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14, //
4848  4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14, //
4849  2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
4850  10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14, //
4851  2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14, //
4852  4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14, //
4853  2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14, //
4854  6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14, //
4855  2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14, //
4856  4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14, //
4857  2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14, //
4858  8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14, //
4859  2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14, //
4860  4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14, //
4861  2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14, //
4862  6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14, //
4863  2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14, //
4864  4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14, //
4865  2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
4866  12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14, //
4867  2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14, //
4868  4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14, //
4869  2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14, //
4870  6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14, //
4871  2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14, //
4872  4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14, //
4873  2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14, //
4874  8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14, //
4875  2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14, //
4876  4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14, //
4877  2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14, //
4878  6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14, //
4879  2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14, //
4880  4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14, //
4881  2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14, //
4882  10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14, //
4883  2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14, //
4884  4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14, //
4885  2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14, //
4886  6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14, //
4887  2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14, //
4888  4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14, //
4889  2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14, //
4890  8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14, //
4891  2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14, //
4892  4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14, //
4893  2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14, //
4894  6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14, //
4895  2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14, //
4896  4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14, //
4897  2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
4898  14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12, //
4899  2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12, //
4900  4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12, //
4901  2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12, //
4902  6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12, //
4903  2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12, //
4904  4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12, //
4905  2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12, //
4906  8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12, //
4907  2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12, //
4908  4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12, //
4909  2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12, //
4910  6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12, //
4911  2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12, //
4912  4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12, //
4913  2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12, //
4914  10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12, //
4915  2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12, //
4916  4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12, //
4917  2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12, //
4918  6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12, //
4919  2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12, //
4920  4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12, //
4921  2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12, //
4922  8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12, //
4923  2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12, //
4924  4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12, //
4925  2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12, //
4926  6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12, //
4927  2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12, //
4928  4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12, //
4929  2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12, //
4930  12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10, //
4931  2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10, //
4932  4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10, //
4933  2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10, //
4934  6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10, //
4935  2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10, //
4936  4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10, //
4937  2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10, //
4938  8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10, //
4939  2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10, //
4940  4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10, //
4941  2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10, //
4942  6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10, //
4943  2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10, //
4944  4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10, //
4945  2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10, //
4946  10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8, //
4947  2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8, //
4948  4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8, //
4949  2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8, //
4950  6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8, //
4951  2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8, //
4952  4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8, //
4953  2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8, //
4954  8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6, //
4955  2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6, //
4956  4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6, //
4957  2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6, //
4958  6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4, //
4959  2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4, //
4960  4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2, //
4961  2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
4962 
4963  const Vec128<uint8_t, 2 * N> byte_idx = Load8Bytes(d8, table + mask_bits * 8);
4964  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
4965  return BitCast(d, pairs + Set(du, 0x0100));
4966 }
4967 
4968 template <typename T, size_t N>
4969 HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<4> /*tag*/,
4970  const uint64_t mask_bits) {
4971  HWY_DASSERT(mask_bits < 16);
4972 
4973  // There are only 4 lanes, so we can afford to load the index vector directly.
4974  alignas(16) constexpr uint8_t packed_array[16 * 16] = {
4975  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
4976  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
4977  4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, //
4978  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
4979  8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, //
4980  0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, //
4981  4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, //
4982  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
4983  12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, //
4984  0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, //
4985  4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, //
4986  0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, //
4987  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, //
4988  0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, //
4989  4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
4990  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
4991  const Simd<T, N, 0> d;
4992  const Repartition<uint8_t, decltype(d)> d8;
4993  return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
4994 }
4995 
4996 #if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64
4997 
4998 template <typename T, size_t N>
4999 HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<8> /*tag*/,
5000  const uint64_t mask_bits) {
5001  HWY_DASSERT(mask_bits < 4);
5002 
5003  // There are only 2 lanes, so we can afford to load the index vector directly.
5004  alignas(16) constexpr uint8_t packed_array[4 * 16] = {
5005  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5006  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5007  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
5008  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
5009 
5010  const Simd<T, N, 0> d;
5011  const Repartition<uint8_t, decltype(d)> d8;
5012  return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
5013 }
5014 
5015 #endif
5016 
5017 // Helper function called by both Compress and CompressStore - avoids a
5018 // redundant BitsFromMask in the latter.
5019 template <typename T, size_t N>
5020 HWY_INLINE Vec128<T, N> Compress(Vec128<T, N> v, const uint64_t mask_bits) {
5021  const auto idx =
5022  detail::IdxFromBits<T, N>(hwy::SizeTag<sizeof(T)>(), mask_bits);
5023  using D = Simd<T, N, 0>;
5024  const RebindToSigned<D> di;
5025  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
5026 }
5027 
5028 } // namespace detail
5029 
5030 template <typename T, size_t N>
5031 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const Mask128<T, N> mask) {
5032  return detail::Compress(v, detail::BitsFromMask(mask));
5033 }
5034 
5035 // ------------------------------ CompressBits
5036 
5037 template <typename T, size_t N>
5038 HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
5039  const uint8_t* HWY_RESTRICT bits) {
5040  uint64_t mask_bits = 0;
5041  constexpr size_t kNumBytes = (N + 7) / 8;
5042  CopyBytes<kNumBytes>(bits, &mask_bits);
5043  if (N < 8) {
5044  mask_bits &= (1ull << N) - 1;
5045  }
5046 
5047  return detail::Compress(v, mask_bits);
5048 }
5049 
5050 // ------------------------------ CompressStore
5051 template <typename T, size_t N>
5052 HWY_API size_t CompressStore(Vec128<T, N> v, const Mask128<T, N> mask,
5053  Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
5054  const uint64_t mask_bits = detail::BitsFromMask(mask);
5055  StoreU(detail::Compress(v, mask_bits), d, unaligned);
5056  return PopCount(mask_bits);
5057 }
5058 
5059 // ------------------------------ CompressBlendedStore
5060 template <typename T, size_t N>
5061 HWY_API size_t CompressBlendedStore(Vec128<T, N> v, Mask128<T, N> m,
5062  Simd<T, N, 0> d,
5063  T* HWY_RESTRICT unaligned) {
5064  const RebindToUnsigned<decltype(d)> du; // so we can support fp16/bf16
5065  using TU = TFromD<decltype(du)>;
5066  const uint64_t mask_bits = detail::BitsFromMask(m);
5067  const size_t count = PopCount(mask_bits);
5068  const Mask128<T, N> store_mask = RebindMask(d, FirstN(du, count));
5069  const Vec128<TU, N> compressed = detail::Compress(BitCast(du, v), mask_bits);
5070  BlendedStore(BitCast(d, compressed), store_mask, d, unaligned);
5071  return count;
5072 }
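A typical use of these helpers is stream compaction; a sketch with assumed inputs (the output array must be able to hold a full vector because CompressStore writes all lanes):

  const Full128<int32_t> d;
  alignas(16) const int32_t in[4] = {-1, 2, -3, 4};
  const auto v = Load(d, in);
  alignas(16) int32_t out[4];
  const size_t kept = CompressStore(v, v < Zero(d), d, out);
  // kept == 2; out[0] == -1, out[1] == -3; the remaining lanes are unspecified.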
5073 
5074 // ------------------------------ CompressBitsStore
5075 
5076 template <typename T, size_t N>
5077 HWY_API size_t CompressBitsStore(Vec128<T, N> v,
5078  const uint8_t* HWY_RESTRICT bits,
5079  Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
5080  uint64_t mask_bits = 0;
5081  constexpr size_t kNumBytes = (N + 7) / 8;
5082  CopyBytes<kNumBytes>(bits, &mask_bits);
5083  if (N < 8) {
5084  mask_bits &= (1ull << N) - 1;
5085  }
5086 
5087  StoreU(detail::Compress(v, mask_bits), d, unaligned);
5088  return PopCount(mask_bits);
5089 }
5090 
5091 // ------------------------------ StoreInterleaved3
5092 
5093 // 128 bits
5094 HWY_API void StoreInterleaved3(const Vec128<uint8_t> v0,
5095  const Vec128<uint8_t> v1,
5096  const Vec128<uint8_t> v2,
5097  Full128<uint8_t> /*tag*/,
5098  uint8_t* HWY_RESTRICT unaligned) {
5099  const uint8x16x3_t triple = {{v0.raw, v1.raw, v2.raw}};
5100  vst3q_u8(unaligned, triple);
5101 }
5102 
5103 // 64 bits
5104 HWY_API void StoreInterleaved3(const Vec64<uint8_t> v0, const Vec64<uint8_t> v1,
5105  const Vec64<uint8_t> v2, Full64<uint8_t> /*tag*/,
5106  uint8_t* HWY_RESTRICT unaligned) {
5107  const uint8x8x3_t triple = {{v0.raw, v1.raw, v2.raw}};
5108  vst3_u8(unaligned, triple);
5109 }
5110 
5111 // <= 32 bits: avoid writing more than N bytes by copying to buffer
5112 template <size_t N, HWY_IF_LE32(uint8_t, N)>
5113 HWY_API void StoreInterleaved3(const Vec128<uint8_t, N> v0,
5114  const Vec128<uint8_t, N> v1,
5115  const Vec128<uint8_t, N> v2,
5116  Simd<uint8_t, N, 0> /*tag*/,
5117  uint8_t* HWY_RESTRICT unaligned) {
5118  alignas(16) uint8_t buf[24];
5119  const uint8x8x3_t triple = {{v0.raw, v1.raw, v2.raw}};
5120  vst3_u8(buf, triple);
5121  CopyBytes<N * 3>(buf, unaligned);
5122 }
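A usage sketch (assumed planar inputs), e.g. packing separate R, G, B byte vectors into interleaved RGB:

  const Full128<uint8_t> d8;
  const auto r = Set(d8, 0x10);
  const auto g = Set(d8, 0x20);
  const auto b = Set(d8, 0x30);
  alignas(16) uint8_t rgb[16 * 3];
  StoreInterleaved3(r, g, b, d8, rgb);  // rgb = 10 20 30 10 20 30 ...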
5123 
5124 // ------------------------------ StoreInterleaved4
5125 
5126 // 128 bits
5127 HWY_API void StoreInterleaved4(const Vec128<uint8_t> v0,
5128  const Vec128<uint8_t> v1,
5129  const Vec128<uint8_t> v2,
5130  const Vec128<uint8_t> v3,
5131  Full128<uint8_t> /*tag*/,
5132  uint8_t* HWY_RESTRICT unaligned) {
5133  const uint8x16x4_t quad = {{v0.raw, v1.raw, v2.raw, v3.raw}};
5134  vst4q_u8(unaligned, quad);
5135 }
5136 
5137 // 64 bits
5138 HWY_API void StoreInterleaved4(const Vec64<uint8_t> v0, const Vec64<uint8_t> v1,
5139  const Vec64<uint8_t> v2, const Vec64<uint8_t> v3,
5140  Full64<uint8_t> /*tag*/,
5141  uint8_t* HWY_RESTRICT unaligned) {
5142  const uint8x8x4_t quad = {{v0.raw, v1.raw, v2.raw, v3.raw}};
5143  vst4_u8(unaligned, quad);
5144 }
5145 
5146 // <= 32 bits: avoid writing more than N bytes by copying to buffer
5147 template <size_t N, HWY_IF_LE32(uint8_t, N)>
5148 HWY_API void StoreInterleaved4(const Vec128<uint8_t, N> v0,
5149  const Vec128<uint8_t, N> v1,
5150  const Vec128<uint8_t, N> v2,
5151  const Vec128<uint8_t, N> v3,
5152  Simd<uint8_t, N, 0> /*tag*/,
5153  uint8_t* HWY_RESTRICT unaligned) {
5154  alignas(16) uint8_t buf[32];
5155  const uint8x8x4_t quad = {{v0.raw, v1.raw, v2.raw, v3.raw}};
5156  vst4_u8(buf, quad);
5157  CopyBytes<N * 4>(buf, unaligned);
5158 }
5159 
5160 // ------------------------------ Lt128
5161 
5162 namespace detail {
5163 
5164 template <size_t kLanes, typename T, size_t N>
5165 HWY_INLINE Mask128<T, N> ShiftMaskLeft(Mask128<T, N> m) {
5166  return MaskFromVec(ShiftLeftLanes<kLanes>(VecFromMask(Simd<T, N, 0>(), m)));
5167 }
5168 
5169 } // namespace detail
5170 
5171 template <typename T, size_t N, HWY_IF_LE128(T, N)>
5172 HWY_API Mask128<T, N> Lt128(Simd<T, N, 0> d, Vec128<T, N> a,
5173  Vec128<T, N> b) {
5174  static_assert(!IsSigned<T>() && sizeof(T) == 8, "Use u64");
5175  // Truth table of Eq and Lt for Hi and Lo u64.
5176  // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
5177  // =H =L cH cL | out = cH | (=H & cL)
5178  // 0 0 0 0 | 0
5179  // 0 0 0 1 | 0
5180  // 0 0 1 0 | 1
5181  // 0 0 1 1 | 1
5182  // 0 1 0 0 | 0
5183  // 0 1 0 1 | 0
5184  // 0 1 1 0 | 1
5185  // 1 0 0 0 | 0
5186  // 1 0 0 1 | 1
5187  // 1 1 0 0 | 0
5188  const Mask128<T, N> eqHL = Eq(a, b);
5189  const Mask128<T, N> ltHL = Lt(a, b);
5190  // We need to bring cL to the upper lane/bit corresponding to cH. Comparing
5191  // the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the
5192  // comparison result leftwards requires only 4.
5193  const Mask128<T, N> ltLx = detail::ShiftMaskLeft<1>(ltHL);
5194  const Mask128<T, N> outHx = Or(ltHL, And(eqHL, ltLx));
5195  const Vec128<T, N> vecHx = VecFromMask(d, outHx);
5196  return MaskFromVec(InterleaveUpper(d, vecHx, vecHx));
5197 }
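A worked sketch with assumed values: lane 1 is the upper 64 bits, so the comparison is decided there, and the resulting mask is identical in both lanes.

  const Full128<uint64_t> du64;
  alignas(16) const uint64_t a_lanes[2] = {5, 1};  // 128-bit value: hi=1, lo=5
  alignas(16) const uint64_t b_lanes[2] = {0, 2};  // 128-bit value: hi=2, lo=0
  const auto lt = Lt128(du64, Load(du64, a_lanes), Load(du64, b_lanes));
  // a < b because 1 < 2 in the upper half; CountTrue(du64, lt) == 2.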
5198 
5199 // ------------------------------ Min128, Max128 (Lt128)
5200 
5201 // Without a native OddEven, it seems infeasible to go faster than Lt128.
5202 template <class D>
5203 HWY_API VFromD<D> Min128(D d, const VFromD<D> a, const VFromD<D> b) {
5204  return IfThenElse(Lt128(d, a, b), a, b);
5205 }
5206 
5207 template <class D>
5208 HWY_API VFromD<D> Max128(D d, const VFromD<D> a, const VFromD<D> b) {
5209  return IfThenElse(Lt128(d, a, b), b, a);
5210 }
5211 
5212 // ================================================== Operator wrapper
5213 
5214 // These wrappers apply to all vector types; there are no restrictions on V.
5215 
5216 template <class V>
5217 HWY_API V Add(V a, V b) {
5218  return a + b;
5219 }
5220 template <class V>
5221 HWY_API V Sub(V a, V b) {
5222  return a - b;
5223 }
5224 
5225 template <class V>
5226 HWY_API V Mul(V a, V b) {
5227  return a * b;
5228 }
5229 template <class V>
5230 HWY_API V Div(V a, V b) {
5231  return a / b;
5232 }
5233 
5234 template <class V>
5235 V Shl(V a, V b) {
5236  return a << b;
5237 }
5238 template <class V>
5239 V Shr(V a, V b) {
5240  return a >> b;
5241 }
5242 
5243 template <class V>
5244 HWY_API auto Eq(V a, V b) -> decltype(a == b) {
5245  return a == b;
5246 }
5247 template <class V>
5248 HWY_API auto Ne(V a, V b) -> decltype(a == b) {
5249  return a != b;
5250 }
5251 template <class V>
5252 HWY_API auto Lt(V a, V b) -> decltype(a == b) {
5253  return a < b;
5254 }
5255 
5256 template <class V>
5257 HWY_API auto Gt(V a, V b) -> decltype(a == b) {
5258  return a > b;
5259 }
5260 template <class V>
5261 HWY_API auto Ge(V a, V b) -> decltype(a == b) {
5262  return a >= b;
5263 }
5264 
5265 template <class V>
5266 HWY_API auto Le(V a, V b) -> decltype(a == b) {
5267  return a <= b;
5268 }
5269 
5270 namespace detail { // for code folding
5271 #if HWY_ARCH_ARM_V7
5272 #undef vuzp1_s8
5273 #undef vuzp1_u8
5274 #undef vuzp1_s16
5275 #undef vuzp1_u16
5276 #undef vuzp1_s32
5277 #undef vuzp1_u32
5278 #undef vuzp1_f32
5279 #undef vuzp1q_s8
5280 #undef vuzp1q_u8
5281 #undef vuzp1q_s16
5282 #undef vuzp1q_u16
5283 #undef vuzp1q_s32
5284 #undef vuzp1q_u32
5285 #undef vuzp1q_f32
5286 #undef vuzp2_s8
5287 #undef vuzp2_u8
5288 #undef vuzp2_s16
5289 #undef vuzp2_u16
5290 #undef vuzp2_s32
5291 #undef vuzp2_u32
5292 #undef vuzp2_f32
5293 #undef vuzp2q_s8
5294 #undef vuzp2q_u8
5295 #undef vuzp2q_s16
5296 #undef vuzp2q_u16
5297 #undef vuzp2q_s32
5298 #undef vuzp2q_u32
5299 #undef vuzp2q_f32
5300 #undef vzip1_s8
5301 #undef vzip1_u8
5302 #undef vzip1_s16
5303 #undef vzip1_u16
5304 #undef vzip1_s32
5305 #undef vzip1_u32
5306 #undef vzip1_f32
5307 #undef vzip1q_s8
5308 #undef vzip1q_u8
5309 #undef vzip1q_s16
5310 #undef vzip1q_u16
5311 #undef vzip1q_s32
5312 #undef vzip1q_u32
5313 #undef vzip1q_f32
5314 #undef vzip2_s8
5315 #undef vzip2_u8
5316 #undef vzip2_s16
5317 #undef vzip2_u16
5318 #undef vzip2_s32
5319 #undef vzip2_u32
5320 #undef vzip2_f32
5321 #undef vzip2q_s8
5322 #undef vzip2q_u8
5323 #undef vzip2q_s16
5324 #undef vzip2q_u16
5325 #undef vzip2q_s32
5326 #undef vzip2q_u32
5327 #undef vzip2q_f32
5328 #endif
5329 
5330 #undef HWY_NEON_BUILD_ARG_1
5331 #undef HWY_NEON_BUILD_ARG_2
5332 #undef HWY_NEON_BUILD_ARG_3
5333 #undef HWY_NEON_BUILD_PARAM_1
5334 #undef HWY_NEON_BUILD_PARAM_2
5335 #undef HWY_NEON_BUILD_PARAM_3
5336 #undef HWY_NEON_BUILD_RET_1
5337 #undef HWY_NEON_BUILD_RET_2
5338 #undef HWY_NEON_BUILD_RET_3
5339 #undef HWY_NEON_BUILD_TPL_1
5340 #undef HWY_NEON_BUILD_TPL_2
5341 #undef HWY_NEON_BUILD_TPL_3
5342 #undef HWY_NEON_DEF_FUNCTION
5343 #undef HWY_NEON_DEF_FUNCTION_ALL_FLOATS
5344 #undef HWY_NEON_DEF_FUNCTION_ALL_TYPES
5345 #undef HWY_NEON_DEF_FUNCTION_INT_8
5346 #undef HWY_NEON_DEF_FUNCTION_INT_16
5347 #undef HWY_NEON_DEF_FUNCTION_INT_32
5348 #undef HWY_NEON_DEF_FUNCTION_INT_8_16_32
5349 #undef HWY_NEON_DEF_FUNCTION_INTS
5350 #undef HWY_NEON_DEF_FUNCTION_INTS_UINTS
5351 #undef HWY_NEON_DEF_FUNCTION_TPL
5352 #undef HWY_NEON_DEF_FUNCTION_UINT_8
5353 #undef HWY_NEON_DEF_FUNCTION_UINT_16
5354 #undef HWY_NEON_DEF_FUNCTION_UINT_32
5355 #undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32
5356 #undef HWY_NEON_DEF_FUNCTION_UINTS
5357 #undef HWY_NEON_EVAL
5358 } // namespace detail
5359 
5360 // NOLINTNEXTLINE(google-readability-namespace-comments)
5361 } // namespace HWY_NAMESPACE
5362 } // namespace hwy
HWY_AFTER_NAMESPACE();
HWY_INLINE Mask128< T, N > TestBit(hwy::SizeTag< 1 >, const Vec128< T, N > v, const Vec128< T, N > bit)
Definition: x86_128-inl.h:1193
HWY_INLINE Vec128< T, 1 > MinOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4309
HWY_INLINE Vec128< uint8_t > Load8Bytes(Full128< uint8_t >, const uint8_t *bytes)
Definition: arm_neon-inl.h:4804
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8) template< size_t N > HWY_INLINE Vec128< uint8_t
HWY_INLINE Vec128< float > ReciprocalNewtonRaphsonStep(const Vec128< float > recip, const Vec128< float > divisor)
Definition: arm_neon-inl.h:1239
HWY_INLINE size_t CountTrue(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition: arm_neon-inl.h:4680
HWY_INLINE Vec128< T, N > Compress(Vec128< T, N > v, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5020
HWY_INLINE Vec128< T, N > IfThenElseZero(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes)
Definition: x86_128-inl.h:728
HWY_INLINE Mask128< T, N > And(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:825
HWY_INLINE Vec128< T, 1 > MaxOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4314
constexpr uint64_t OnlyActive(uint64_t bits)
Definition: arm_neon-inl.h:4664
HWY_INLINE Vec128< uint8_t, N > BitCastFromByte(Simd< uint8_t, N, 0 >, Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:601
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:574
HWY_API Vec128< uint64_t > InterleaveUpper(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:3635
HWY_INLINE Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, uint64_t mask_bits)
Definition: arm_neon-inl.h:4467
Mask128< T, N > ShiftMaskLeft(Mask128< T, N > m)
Definition: arm_neon-inl.h:5165
HWY_INLINE Vec128< T, N > IdxFromBits(hwy::SizeTag< 2 >, const uint64_t mask_bits)
Definition: arm_neon-inl.h:4818
HWY_INLINE Vec128< T, 1 > SumOfLanes(const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4305
HWY_API Mask1< T > operator<=(const Vec1< T > a, const Vec1< T > b)
Definition: scalar-inl.h:842
d
Definition: rvv-inl.h:1656
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:1648
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition: arm_neon-inl.h:4038
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:61
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition: arm_neon-inl.h:3709
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition: arm_neon-inl.h:1688
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition: arm_neon-inl.h:4003
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition: arm_neon-inl.h:1225
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition: arm_neon-inl.h:3672
HWY_API uint8_t GetLane(const Vec128< uint8_t, 16 > v)
Definition: arm_neon-inl.h:767
HWY_API Vec128< T, N > PopulationCount(Vec128< T, N > v)
Definition: arm_neon-inl.h:1595
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition: arm_neon-inl.h:3531
HWY_NEON_DEF_FUNCTION_INT_8_16_32(Neg, vneg, _, 1) HWY_API Vec64< int64_t > Neg(const Vec64< int64_t > v)
Definition: arm_neon-inl.h:894
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5252
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:3581
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition: arm_neon-inl.h:3547
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2878
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N, 0 > d, Vec128< T, N/2 > lo)
Definition: arm_neon-inl.h:3769
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5244
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1122
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition: arm_neon-inl.h:2999
HWY_API auto Gt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5257
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3102
_
Definition: rvv-inl.h:1405
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition: arm_neon-inl.h:1896
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition: arm_neon-inl.h:4761
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1290
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2416
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:1604
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition: ops/shared-inl.h:210
HWY_API bool AllTrue(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:4790
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4437
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition: arm_neon-inl.h:1665
V Shl(V a, V b)
Definition: arm_neon-inl.h:5235
HWY_API auto Ge(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5261
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1264
StoreInterleaved3
Definition: rvv-inl.h:1405
HWY_API Vec128< uint32_t > ConcatOdd(Full128< uint32_t >, Vec128< uint32_t > hi, Vec128< uint32_t > lo)
Definition: arm_neon-inl.h:3903
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:1957
HWY_API Vec256< uint64_t > CLMulLower(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:3972
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:1995
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition: arm_neon-inl.h:1675
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3842
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: ops/shared-inl.h:201
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4284
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:953
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4159
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:1885
HWY_API Vec1< uint8_t > SaturatedAdd(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:457
sseg3 sseg3 StoreInterleaved4
Definition: rvv-inl.h:1428
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition: arm_neon-inl.h:3541
HWY_API Vec128< uint32_t > ConcatEven(Full128< uint32_t >, Vec128< uint32_t > hi, Vec128< uint32_t > lo)
Definition: arm_neon-inl.h:3945
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2205
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3869
HWY_API Vec128< int64_t > Neg(const Vec128< int64_t > v)
Definition: arm_neon-inl.h:904
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:733
HWY_API Vec256< uint8_t > AESRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition: x86_256-inl.h:3947
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1513
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1518
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition: arm_neon-inl.h:4119
HWY_API size_t Lanes(Simd< T, N, kPow2 > d)
Definition: arm_sve-inl.h:218
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N, 0 > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition: arm_neon-inl.h:4060
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2210
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2952
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition: ops/shared-inl.h:199
HWY_API Mask1< T > operator==(const Vec1< T > a, const Vec1< T > b)
Definition: scalar-inl.h:817
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition: arm_neon-inl.h:2748
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition: arm_neon-inl.h:3688
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:1505
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1523
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2402
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition: arm_neon-inl.h:4742
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition: arm_neon-inl.h:1681
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 >, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:4753
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:2788
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition: arm_neon-inl.h:3987
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:1711
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:5217
HWY_API bool AllFalse(const Full128< T > d, const Mask128< T > m)
Definition: arm_neon-inl.h:4771
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition: arm_neon-inl.h:3419
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3490
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2909
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1344
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:1656
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition: arm_neon-inl.h:1735
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2224
HWY_API Mask128< T, N > operator!=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1815
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2895
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition: arm_neon-inl.h:3373
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition: arm_neon-inl.h:4045
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3091
HWY_INLINE VFromD< D > Min128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:5203
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3461
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition: arm_sve-inl.h:282
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3513
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4445
HWY_API Mask1< T > operator<(const Vec1< T > a, const Vec1< T > b)
Definition: scalar-inl.h:833
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition: arm_neon-inl.h:757
HWY_API Vec1< uint8_t > AverageRound(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:510
HWY_API Vec1< T > ShiftRight(const Vec1< T > v)
Definition: scalar-inl.h:345
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:4510
HWY_API Vec256< uint64_t > CLMulUpper(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:3983
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition: arm_neon-inl.h:3535
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition: arm_neon-inl.h:1917
HWY_API Vec256< uint8_t > AESLastRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition: x86_256-inl.h:3959
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2031
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1175
HWY_API Vec1< uint8_t > SaturatedSub(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:484
HWY_INLINE Mask128< T, N > Lt128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:5172
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition: arm_neon-inl.h:1252
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition: arm_neon-inl.h:1498
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:1724
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3895
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1440
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:710
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition: arm_neon-inl.h:1211
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4231
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition: arm_neon-inl.h:3681
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3777
HWY_API V Sub(V a, V b)
Definition: arm_neon-inl.h:5221
typename D::template Rebind< T > Rebind
Definition: ops/shared-inl.h:196
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition: arm_neon-inl.h:747
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:1889
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5077
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition: arm_neon-inl.h:1133
HWY_API V InterleaveUpper(Simd< T, N, 0 >, V a, V b)
Definition: arm_neon-inl.h:3656
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4267
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5061
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:1718
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1489
HWY_INLINE VFromD< D > Max128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:5208
HWY_API Vec1< T > ShiftLeft(const Vec1< T > v)
Definition: scalar-inl.h:339
HWY_API auto Le(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5266
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1126
decltype(detail::DeduceD()(V())) DFromV
Definition: arm_neon-inl.h:555
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2939
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition: arm_neon-inl.h:3413
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4249
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1422
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1323
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition: arm_neon-inl.h:1778
V Shr(V a, V b)
Definition: arm_neon-inl.h:5239
decltype(Zero(D())) VFromD
Definition: arm_neon-inl.h:743
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2217
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:4019
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3285
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5038
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition: arm_neon-inl.h:3553
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2867
typename D::Half Half
Definition: ops/shared-inl.h:216
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4441
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3114
typename D::template Repartition< T > Repartition
Definition: ops/shared-inl.h:207
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5248
N
Definition: rvv-inl.h:1656
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1404
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:2606
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:4169
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5052
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition: arm_neon-inl.h:935
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1455
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:4053
HWY_API V Div(V a, V b)
Definition: arm_neon-inl.h:5230
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1033
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:852
HWY_API V Mul(V a, V b)
Definition: arm_neon-inl.h:5226
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:3430
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2397
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition: arm_neon-inl.h:2426
TFromD< DFromV< V > > TFromV
Definition: arm_neon-inl.h:558
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:3146
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1376
const vfloat64m1_t v
Definition: rvv-inl.h:1656
HWY_API Vec128< T, N > Compress(Vec128< T, N > v, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:5031
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3120
typename D::T TFromD
Definition: ops/shared-inl.h:192
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition: arm_neon-inl.h:4224
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1477
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1352
HWY_API Vec1< T > IfThenElse(const Mask1< T > mask, const Vec1< T > yes, const Vec1< T > no)
Definition: scalar-inl.h:278
Definition: aligned_allocator.h:27
HWY_API void CopyBytes(const From *from, To *to)
Definition: base.h:732
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition: base.h:608
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition: base.h:711
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition: base.h:466
constexpr float MantissaEnd< float >()
Definition: base.h:575
HWY_API size_t PopCount(uint64_t x)
Definition: base.h:667
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:452
typename detail::Relations< T >::Signed MakeSigned
Definition: base.h:454
#define HWY_ALIGN
Definition: set_macros-inl.h:81
#define HWY_NAMESPACE
Definition: set_macros-inl.h:80
Definition: arm_neon-inl.h:4797
@ value
Definition: arm_neon-inl.h:4798
Definition: arm_neon-inl.h:3368
detail::Raw128< T, N >::type raw
Definition: arm_neon-inl.h:3369
Definition: ops/shared-inl.h:40
Definition: arm_neon-inl.h:545
Simd< T, N, 0 > operator()(Vec128< T, N >) const
Definition: arm_neon-inl.h:547
uint16x4_t type
Definition: arm_neon-inl.h:468
uint16x4_t type
Definition: arm_neon-inl.h:432
uint16x4_t type
Definition: arm_neon-inl.h:379
uint16x8_t type
Definition: arm_neon-inl.h:316
uint16x4_t type
Definition: arm_neon-inl.h:463
uint16x4_t type
Definition: arm_neon-inl.h:427
uint16x4_t type
Definition: arm_neon-inl.h:374
uint16x8_t type
Definition: arm_neon-inl.h:311
float32x2_t type
Definition: arm_neon-inl.h:437
float32x2_t type
Definition: arm_neon-inl.h:384
float32x4_t type
Definition: arm_neon-inl.h:321
int16x4_t type
Definition: arm_neon-inl.h:458
int16x4_t type
Definition: arm_neon-inl.h:417
int16x4_t type
Definition: arm_neon-inl.h:359
int16x8_t type
Definition: arm_neon-inl.h:296
int32x2_t type
Definition: arm_neon-inl.h:422
int32x2_t type
Definition: arm_neon-inl.h:364
int32x4_t type
Definition: arm_neon-inl.h:301
int64x1_t type
Definition: arm_neon-inl.h:369
int64x2_t type
Definition: arm_neon-inl.h:306
int8x16_t type
Definition: arm_neon-inl.h:291
int8x8_t type
Definition: arm_neon-inl.h:479
int8x8_t type
Definition: arm_neon-inl.h:453
int8x8_t type
Definition: arm_neon-inl.h:412
int8x8_t type
Definition: arm_neon-inl.h:354
uint16x4_t type
Definition: arm_neon-inl.h:448
uint16x4_t type
Definition: arm_neon-inl.h:402
uint16x4_t type
Definition: arm_neon-inl.h:339
uint16x8_t type
Definition: arm_neon-inl.h:276
uint32x2_t type
Definition: arm_neon-inl.h:407
uint32x2_t type
Definition: arm_neon-inl.h:344
uint32x4_t type
Definition: arm_neon-inl.h:281
uint64x1_t type
Definition: arm_neon-inl.h:349
uint64x2_t type
Definition: arm_neon-inl.h:286
uint8x16_t type
Definition: arm_neon-inl.h:271
uint8x8_t type
Definition: arm_neon-inl.h:474
uint8x8_t type
Definition: arm_neon-inl.h:443
uint8x8_t type
Definition: arm_neon-inl.h:397
uint8x8_t type
Definition: arm_neon-inl.h:334
Definition: x86_128-inl.h:69
__v128_u type
Definition: wasm_128-inl.h:62
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3046
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N >)
Definition: arm_neon-inl.h:3053
Definition: arm_neon-inl.h:3024
HWY_INLINE Vec128< T > operator()(const Vec128< T > v)
Definition: arm_neon-inl.h:3027
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3034
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3076
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N >)
Definition: arm_neon-inl.h:3083
Definition: arm_neon-inl.h:3059
HWY_INLINE Vec128< T, N > operator()(Vec128< T, N > v)
Definition: arm_neon-inl.h:3061
Definition: base.h:317
Definition: base.h:253
Definition: base.h:248