Grok  9.7.5
scalar-inl.h
Go to the documentation of this file.
1 // Copyright 2019 Google LLC
2 // SPDX-License-Identifier: Apache-2.0
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 // Single-element vectors and operations.
17 // External include guard in highway.h - see comment there.
18 
19 #include <stddef.h>
20 #include <stdint.h>
21 
22 #include "hwy/base.h"
23 #include "hwy/ops/shared-inl.h"
24 
26 namespace hwy {
27 namespace HWY_NAMESPACE {
28 
29 // Single instruction, single data.
30 template <typename T>
32 
// (Wrapper class required for overloading comparison operators.)
// Holds exactly one lane of type T in `raw`.
template <typename T>
struct Vec1 {
  HWY_INLINE Vec1() = default;
  Vec1(const Vec1&) = default;
  Vec1& operator=(const Vec1&) = default;
  // explicit: prevents accidental implicit T -> Vec1<T> conversions.
  HWY_INLINE explicit Vec1(const T t) : raw(t) {}

  // Compound assignments forward to the free binary operators defined later
  // in this file. Member function templates are only instantiated on use, so
  // lane types lacking e.g. operator/ still compile.
  HWY_INLINE Vec1& operator*=(const Vec1 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec1& operator/=(const Vec1 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec1& operator+=(const Vec1 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec1& operator-=(const Vec1 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec1& operator&=(const Vec1 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec1& operator|=(const Vec1 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec1& operator^=(const Vec1 other) {
    return *this = (*this ^ other);
  }

  // The single lane's value.
  T raw;
};
65 
66 // 0 or FF..FF, same size as Vec1.
67 template <typename T>
68 class Mask1 {
70 
71  public:
72  static HWY_INLINE Mask1<T> FromBool(bool b) {
73  Mask1<T> mask;
74  mask.bits = b ? ~Raw(0) : 0;
75  return mask;
76  }
77 
79 };
80 
81 namespace detail {
82 
83 // Deduce Sisd<T> from Vec1<T>
84 struct Deduce1 {
85  template <typename T>
87  return Sisd<T>();
88  }
89 };
90 
91 } // namespace detail
92 
93 template <class V>
94 using DFromV = decltype(detail::Deduce1()(V()));
95 
96 template <class V>
97 using TFromV = TFromD<DFromV<V>>;
98 
99 // ------------------------------ BitCast
100 
101 template <typename T, typename FromT>
103  static_assert(sizeof(T) <= sizeof(FromT), "Promoting is undefined");
104  T to;
105  CopyBytes<sizeof(FromT)>(&v.raw, &to);
106  return Vec1<T>(to);
107 }
108 
109 // ------------------------------ Set
110 
111 template <typename T>
113  return Vec1<T>(T(0));
114 }
115 
116 template <typename T, typename T2>
117 HWY_API Vec1<T> Set(Sisd<T> /* tag */, const T2 t) {
118  return Vec1<T>(static_cast<T>(t));
119 }
120 
121 template <typename T>
123  return Zero(d);
124 }
125 
126 template <typename T, typename T2>
127 HWY_API Vec1<T> Iota(const Sisd<T> /* tag */, const T2 first) {
128  return Vec1<T>(static_cast<T>(first));
129 }
130 
131 // ================================================== LOGICAL
132 
133 // ------------------------------ Not
134 
135 template <typename T>
137  using TU = MakeUnsigned<T>;
138  const Sisd<TU> du;
139  return BitCast(Sisd<T>(), Vec1<TU>(static_cast<TU>(~BitCast(du, v).raw)));
140 }
141 
142 // ------------------------------ And
143 
144 template <typename T>
145 HWY_API Vec1<T> And(const Vec1<T> a, const Vec1<T> b) {
146  using TU = MakeUnsigned<T>;
147  const Sisd<TU> du;
148  return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw & BitCast(du, b).raw));
149 }
150 template <typename T>
152  return And(a, b);
153 }
154 
155 // ------------------------------ AndNot
156 
157 template <typename T>
158 HWY_API Vec1<T> AndNot(const Vec1<T> a, const Vec1<T> b) {
159  using TU = MakeUnsigned<T>;
160  const Sisd<TU> du;
161  return BitCast(Sisd<T>(), Vec1<TU>(static_cast<TU>(~BitCast(du, a).raw &
162  BitCast(du, b).raw)));
163 }
164 
165 // ------------------------------ Or
166 
167 template <typename T>
168 HWY_API Vec1<T> Or(const Vec1<T> a, const Vec1<T> b) {
169  using TU = MakeUnsigned<T>;
170  const Sisd<TU> du;
171  return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw | BitCast(du, b).raw));
172 }
173 template <typename T>
175  return Or(a, b);
176 }
177 
178 // ------------------------------ Xor
179 
180 template <typename T>
181 HWY_API Vec1<T> Xor(const Vec1<T> a, const Vec1<T> b) {
182  using TU = MakeUnsigned<T>;
183  const Sisd<TU> du;
184  return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw ^ BitCast(du, b).raw));
185 }
186 template <typename T>
188  return Xor(a, b);
189 }
190 
191 // ------------------------------ OrAnd
192 
193 template <typename T>
194 HWY_API Vec1<T> OrAnd(const Vec1<T> o, const Vec1<T> a1, const Vec1<T> a2) {
195  return Or(o, And(a1, a2));
196 }
197 
198 // ------------------------------ IfVecThenElse
199 
200 template <typename T>
202  return IfThenElse(MaskFromVec(mask), yes, no);
203 }
204 
205 // ------------------------------ CopySign
206 
207 template <typename T>
208 HWY_API Vec1<T> CopySign(const Vec1<T> magn, const Vec1<T> sign) {
209  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
210  const auto msb = SignBit(Sisd<T>());
211  return Or(AndNot(msb, magn), And(msb, sign));
212 }
213 
214 template <typename T>
215 HWY_API Vec1<T> CopySignToAbs(const Vec1<T> abs, const Vec1<T> sign) {
216  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
217  return Or(abs, And(SignBit(Sisd<T>()), sign));
218 }
219 
220 // ------------------------------ BroadcastSignBit
221 
222 template <typename T>
224  // This is used inside ShiftRight, so we cannot implement in terms of it.
225  return v.raw < 0 ? Vec1<T>(T(-1)) : Vec1<T>(0);
226 }
227 
228 // ------------------------------ PopulationCount
229 
230 #ifdef HWY_NATIVE_POPCNT
231 #undef HWY_NATIVE_POPCNT
232 #else
233 #define HWY_NATIVE_POPCNT
234 #endif
235 
236 template <typename T>
238  return Vec1<T>(static_cast<T>(PopCount(v.raw)));
239 }
240 
241 // ------------------------------ Mask
242 
243 template <typename TFrom, typename TTo>
245  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
246  return Mask1<TTo>{m.bits};
247 }
248 
249 // v must be 0 or FF..FF.
250 template <typename T>
252  Mask1<T> mask;
253  CopyBytes<sizeof(mask.bits)>(&v.raw, &mask.bits);
254  return mask;
255 }
256 
257 template <typename T>
259  Vec1<T> v;
260  CopyBytes<sizeof(v.raw)>(&mask.bits, &v.raw);
261  return v;
262 }
263 
264 template <typename T>
265 Vec1<T> VecFromMask(Sisd<T> /* tag */, const Mask1<T> mask) {
266  Vec1<T> v;
267  CopyBytes<sizeof(v.raw)>(&mask.bits, &v.raw);
268  return v;
269 }
270 
271 template <typename T>
272 HWY_API Mask1<T> FirstN(Sisd<T> /*tag*/, size_t n) {
273  return Mask1<T>::FromBool(n != 0);
274 }
275 
276 // Returns mask ? yes : no.
277 template <typename T>
278 HWY_API Vec1<T> IfThenElse(const Mask1<T> mask, const Vec1<T> yes,
279  const Vec1<T> no) {
280  return mask.bits ? yes : no;
281 }
282 
283 template <typename T>
285  return mask.bits ? yes : Vec1<T>(0);
286 }
287 
288 template <typename T>
290  return mask.bits ? Vec1<T>(0) : no;
291 }
292 
293 template <typename T>
295  return v.raw < 0 ? yes : no;
296 }
297 
298 template <typename T>
300  return v.raw < 0 ? Vec1<T>(0) : v;
301 }
302 
303 // ------------------------------ Mask logical
304 
305 template <typename T>
307  return MaskFromVec(Not(VecFromMask(Sisd<T>(), m)));
308 }
309 
310 template <typename T>
312  const Sisd<T> d;
313  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
314 }
315 
316 template <typename T>
318  const Sisd<T> d;
319  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
320 }
321 
322 template <typename T>
324  const Sisd<T> d;
325  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
326 }
327 
328 template <typename T>
330  const Sisd<T> d;
331  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
332 }
333 
334 // ================================================== SHIFTS
335 
336 // ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit)
337 
338 template <int kBits, typename T>
340  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
341  return Vec1<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << kBits);
342 }
343 
344 template <int kBits, typename T>
346  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
347 #if __cplusplus >= 202002L
348  // Signed right shift is now guaranteed to be arithmetic (rounding toward
349  // negative infinity, i.e. shifting in the sign bit).
350  return Vec1<T>(v.raw >> kBits);
351 #else
352  if (IsSigned<T>()) {
353  // Emulate arithmetic shift using only logical (unsigned) shifts, because
354  // signed shifts are still implementation-defined.
355  using TU = hwy::MakeUnsigned<T>;
356  const Sisd<TU> du;
357  const TU shifted = BitCast(du, v).raw >> kBits;
358  const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
359  const TU upper = sign << (sizeof(TU) * 8 - 1 - kBits);
360  return BitCast(Sisd<T>(), Vec1<TU>(shifted | upper));
361  } else {
362  return Vec1<T>(v.raw >> kBits); // unsigned, logical shift
363  }
364 #endif
365 }
366 
367 // ------------------------------ RotateRight (ShiftRight)
368 
369 template <int kBits, typename T>
371  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
372  if (kBits == 0) return v;
373  return Or(ShiftRight<kBits>(v), ShiftLeft<sizeof(T) * 8 - kBits>(v));
374 }
375 
376 // ------------------------------ ShiftLeftSame (BroadcastSignBit)
377 
378 template <typename T>
380  return Vec1<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << bits);
381 }
382 
383 template <typename T>
385 #if __cplusplus >= 202002L
386  // Signed right shift is now guaranteed to be arithmetic (rounding toward
387  // negative infinity, i.e. shifting in the sign bit).
388  return Vec1<T>(v.raw >> bits);
389 #else
390  if (IsSigned<T>()) {
391  // Emulate arithmetic shift using only logical (unsigned) shifts, because
392  // signed shifts are still implementation-defined.
393  using TU = hwy::MakeUnsigned<T>;
394  const Sisd<TU> du;
395  const TU shifted = BitCast(du, v).raw >> bits;
396  const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
397  const TU upper = sign << (sizeof(TU) * 8 - 1 - bits);
398  return BitCast(Sisd<T>(), Vec1<TU>(shifted | upper));
399  } else {
400  return Vec1<T>(v.raw >> bits); // unsigned, logical shift
401  }
402 #endif
403 }
404 
405 // ------------------------------ Shl
406 
407 // Single-lane => same as ShiftLeftSame except for the argument type.
408 template <typename T>
410  return ShiftLeftSame(v, static_cast<int>(bits.raw));
411 }
412 
413 template <typename T>
415  return ShiftRightSame(v, static_cast<int>(bits.raw));
416 }
417 
418 // ================================================== ARITHMETIC
419 
420 template <typename T>
422  const uint64_t a64 = static_cast<uint64_t>(a.raw);
423  const uint64_t b64 = static_cast<uint64_t>(b.raw);
424  return Vec1<T>(static_cast<T>((a64 + b64) & static_cast<uint64_t>(~T(0))));
425 }
427  return Vec1<float>(a.raw + b.raw);
428 }
430  return Vec1<double>(a.raw + b.raw);
431 }
432 
433 template <typename T>
435  const uint64_t a64 = static_cast<uint64_t>(a.raw);
436  const uint64_t b64 = static_cast<uint64_t>(b.raw);
437  return Vec1<T>(static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0))));
438 }
440  return Vec1<float>(a.raw - b.raw);
441 }
443  return Vec1<double>(a.raw - b.raw);
444 }
445 
446 // ------------------------------ SumsOf8
447 
449  return Vec1<uint64_t>(v.raw);
450 }
451 
452 // ------------------------------ SaturatedAdd
453 
454 // Returns a + b clamped to the destination range.
455 
456 // Unsigned
458  const Vec1<uint8_t> b) {
459  return Vec1<uint8_t>(
460  static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 255)));
461 }
463  const Vec1<uint16_t> b) {
464  return Vec1<uint16_t>(
465  static_cast<uint16_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 65535)));
466 }
467 
468 // Signed
470  return Vec1<int8_t>(
471  static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw + b.raw), 127)));
472 }
474  const Vec1<int16_t> b) {
475  return Vec1<int16_t>(
476  static_cast<int16_t>(HWY_MIN(HWY_MAX(-32768, a.raw + b.raw), 32767)));
477 }
478 
479 // ------------------------------ Saturating subtraction
480 
481 // Returns a - b clamped to the destination range.
482 
483 // Unsigned
485  const Vec1<uint8_t> b) {
486  return Vec1<uint8_t>(
487  static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 255)));
488 }
490  const Vec1<uint16_t> b) {
491  return Vec1<uint16_t>(
492  static_cast<uint16_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 65535)));
493 }
494 
495 // Signed
497  return Vec1<int8_t>(
498  static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw - b.raw), 127)));
499 }
501  const Vec1<int16_t> b) {
502  return Vec1<int16_t>(
503  static_cast<int16_t>(HWY_MIN(HWY_MAX(-32768, a.raw - b.raw), 32767)));
504 }
505 
506 // ------------------------------ Average
507 
508 // Returns (a + b + 1) / 2
509 
511  const Vec1<uint8_t> b) {
512  return Vec1<uint8_t>(static_cast<uint8_t>((a.raw + b.raw + 1) / 2));
513 }
515  const Vec1<uint16_t> b) {
516  return Vec1<uint16_t>(static_cast<uint16_t>((a.raw + b.raw + 1) / 2));
517 }
518 
519 // ------------------------------ Absolute value
520 
521 template <typename T>
523  const T i = a.raw;
524  return (i >= 0 || i == hwy::LimitsMin<T>()) ? a : Vec1<T>(-i);
525 }
527  return Vec1<float>(std::abs(a.raw));
528 }
530  return Vec1<double>(std::abs(a.raw));
531 }
532 
533 // ------------------------------ min/max
534 
535 template <typename T, HWY_IF_NOT_FLOAT(T)>
536 HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
537  return Vec1<T>(HWY_MIN(a.raw, b.raw));
538 }
539 
540 template <typename T, HWY_IF_FLOAT(T)>
541 HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
542  if (std::isnan(a.raw)) return b;
543  if (std::isnan(b.raw)) return a;
544  return Vec1<T>(HWY_MIN(a.raw, b.raw));
545 }
546 
547 template <typename T, HWY_IF_NOT_FLOAT(T)>
548 HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
549  return Vec1<T>(HWY_MAX(a.raw, b.raw));
550 }
551 
552 template <typename T, HWY_IF_FLOAT(T)>
553 HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
554  if (std::isnan(a.raw)) return b;
555  if (std::isnan(b.raw)) return a;
556  return Vec1<T>(HWY_MAX(a.raw, b.raw));
557 }
558 
559 // ------------------------------ Floating-point negate
560 
561 template <typename T, HWY_IF_FLOAT(T)>
563  return Xor(v, SignBit(Sisd<T>()));
564 }
565 
566 template <typename T, HWY_IF_NOT_FLOAT(T)>
567 HWY_API Vec1<T> Neg(const Vec1<T> v) {
568  return Zero(Sisd<T>()) - v;
569 }
570 
571 // ------------------------------ mul/div
572 
573 template <typename T, HWY_IF_FLOAT(T)>
575  return Vec1<T>(static_cast<T>(double(a.raw) * b.raw));
576 }
577 
578 template <typename T, HWY_IF_SIGNED(T)>
579 HWY_API Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) {
580  return Vec1<T>(static_cast<T>(int64_t(a.raw) * b.raw));
581 }
582 
583 template <typename T, HWY_IF_UNSIGNED(T)>
584 HWY_API Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) {
585  return Vec1<T>(static_cast<T>(uint64_t(a.raw) * b.raw));
586 }
587 
588 template <typename T>
590  return Vec1<T>(a.raw / b.raw);
591 }
592 
593 // Returns the upper 16 bits of a * b in each lane.
595  return Vec1<int16_t>(static_cast<int16_t>((a.raw * b.raw) >> 16));
596 }
598  // Cast to uint32_t first to prevent overflow. Otherwise the result of
599  // uint16_t * uint16_t is in "int" which may overflow. In practice the result
600  // is the same but this way it is also defined.
601  return Vec1<uint16_t>(static_cast<uint16_t>(
602  (static_cast<uint32_t>(a.raw) * static_cast<uint32_t>(b.raw)) >> 16));
603 }
604 
606  return Vec1<int16_t>(static_cast<int16_t>((2 * a.raw * b.raw + 32768) >> 16));
607 }
608 
609 // Multiplies even lanes (0, 2 ..) and returns the double-wide result.
611  const int64_t a64 = a.raw;
612  return Vec1<int64_t>(a64 * b.raw);
613 }
615  const uint64_t a64 = a.raw;
616  return Vec1<uint64_t>(a64 * b.raw);
617 }
618 
619 // Approximate reciprocal
621  // Zero inputs are allowed, but callers are responsible for replacing the
622  // return value with something else (typically using IfThenElse). This check
623  // avoids a ubsan error. The return value is arbitrary.
624  if (v.raw == 0.0f) return Vec1<float>(0.0f);
625  return Vec1<float>(1.0f / v.raw);
626 }
627 
628 // Absolute value of difference.
630  return Abs(a - b);
631 }
632 
633 // ------------------------------ Floating-point multiply-add variants
634 
635 template <typename T>
636 HWY_API Vec1<T> MulAdd(const Vec1<T> mul, const Vec1<T> x, const Vec1<T> add) {
637  return mul * x + add;
638 }
639 
640 template <typename T>
642  const Vec1<T> add) {
643  return add - mul * x;
644 }
645 
646 template <typename T>
647 HWY_API Vec1<T> MulSub(const Vec1<T> mul, const Vec1<T> x, const Vec1<T> sub) {
648  return mul * x - sub;
649 }
650 
651 template <typename T>
653  const Vec1<T> sub) {
654  return Neg(mul) * x - sub;
655 }
656 
657 // ------------------------------ Floating-point square root
658 
659 // Approximate reciprocal square root
661  float f = v.raw;
662  const float half = f * 0.5f;
663  uint32_t bits;
664  CopyBytes<4>(&f, &bits);
665  // Initial guess based on log2(f)
666  bits = 0x5F3759DF - (bits >> 1);
667  CopyBytes<4>(&bits, &f);
668  // One Newton-Raphson iteration
669  return Vec1<float>(f * (1.5f - (half * f * f)));
670 }
671 
672 // Square root
674  return Vec1<float>(std::sqrt(v.raw));
675 }
677  return Vec1<double>(std::sqrt(v.raw));
678 }
679 
680 // ------------------------------ Floating-point rounding
681 
682 template <typename T>
684  using TI = MakeSigned<T>;
685  if (!(Abs(v).raw < MantissaEnd<T>())) { // Huge or NaN
686  return v;
687  }
688  const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5);
689  const TI rounded = static_cast<TI>(v.raw + bias);
690  if (rounded == 0) return CopySignToAbs(Vec1<T>(0), v);
691  // Round to even
692  if ((rounded & 1) && std::abs(rounded - v.raw) == T(0.5)) {
693  return Vec1<T>(static_cast<T>(rounded - (v.raw < T(0) ? -1 : 1)));
694  }
695  return Vec1<T>(static_cast<T>(rounded));
696 }
697 
698 // Round-to-nearest even.
700  using T = float;
701  using TI = int32_t;
702 
703  const T abs = Abs(v).raw;
704  const bool signbit = std::signbit(v.raw);
705 
706  if (!(abs < MantissaEnd<T>())) { // Huge or NaN
707  // Check if too large to cast or NaN
708  if (!(abs <= static_cast<T>(LimitsMax<TI>()))) {
709  return Vec1<TI>(signbit ? LimitsMin<TI>() : LimitsMax<TI>());
710  }
711  return Vec1<int32_t>(static_cast<TI>(v.raw));
712  }
713  const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5);
714  const TI rounded = static_cast<TI>(v.raw + bias);
715  if (rounded == 0) return Vec1<int32_t>(0);
716  // Round to even
717  if ((rounded & 1) && std::abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
718  return Vec1<TI>(rounded - (signbit ? -1 : 1));
719  }
720  return Vec1<TI>(rounded);
721 }
722 
723 template <typename T>
725  using TI = MakeSigned<T>;
726  if (!(Abs(v).raw <= MantissaEnd<T>())) { // Huge or NaN
727  return v;
728  }
729  const TI truncated = static_cast<TI>(v.raw);
730  if (truncated == 0) return CopySignToAbs(Vec1<T>(0), v);
731  return Vec1<T>(static_cast<T>(truncated));
732 }
733 
// Bit-twiddling round-toward-+infinity for a single IEEE-754 lane.
// Float: the lane type; Bits: unsigned integer of the same size;
// kMantissaBits/kExponentBits: field widths (the callers below pass 23/8 for
// float and 52/11 for double).
template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
          class V>
V Ceiling(const V v) {
  const Bits kExponentMask = (1ull << kExponentBits) - 1;
  const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
  const Bits kBias = kExponentMask / 2;

  Float f = v.raw;
  const bool positive = f > Float(0.0);

  // Reinterpret the lane's bit pattern (&v: V holds only the raw lane).
  Bits bits;
  CopyBytes<sizeof(Bits)>(&v, &bits);

  // Unbiased exponent; negative when |v| < 1.
  const int exponent =
      static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
  // Already an integer.
  if (exponent >= kMantissaBits) return v;
  // |v| <= 1 => 0 or 1.
  if (exponent < 0) return positive ? V(1) : V(-0.0);

  // Bits below this mask represent the fractional part.
  const Bits mantissa_mask = kMantissaMask >> exponent;
  // Already an integer
  if ((bits & mantissa_mask) == 0) return v;

  // Clear fractional bits and round up
  // (adding one ULP-at-this-exponent carries into the integer part).
  if (positive) bits += (kMantissaMask + 1) >> exponent;
  bits &= ~mantissa_mask;

  CopyBytes<sizeof(Bits)>(&bits, &f);
  return V(f);
}
765 
// Bit-twiddling round-toward--infinity for a single IEEE-754 lane; mirror
// image of Ceiling above (rounds away from zero only for negative inputs).
template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
          class V>
V Floor(const V v) {
  const Bits kExponentMask = (1ull << kExponentBits) - 1;
  const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
  const Bits kBias = kExponentMask / 2;

  Float f = v.raw;
  const bool negative = f < Float(0.0);

  // Reinterpret the lane's bit pattern (&v: V holds only the raw lane).
  Bits bits;
  CopyBytes<sizeof(Bits)>(&v, &bits);

  // Unbiased exponent; negative when |v| < 1.
  const int exponent =
      static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
  // Already an integer.
  if (exponent >= kMantissaBits) return v;
  // |v| <= 1 => -1 or 0.
  if (exponent < 0) return V(negative ? Float(-1.0) : Float(0.0));

  // Bits below this mask represent the fractional part.
  const Bits mantissa_mask = kMantissaMask >> exponent;
  // Already an integer
  if ((bits & mantissa_mask) == 0) return v;

  // Clear fractional bits and round down
  // (for negatives, incrementing the magnitude moves toward -infinity).
  if (negative) bits += (kMantissaMask + 1) >> exponent;
  bits &= ~mantissa_mask;

  CopyBytes<sizeof(Bits)>(&bits, &f);
  return V(f);
}
797 
798 // Toward +infinity, aka ceiling
800  return Ceiling<float, uint32_t, 23, 8>(v);
801 }
803  return Ceiling<double, uint64_t, 52, 11>(v);
804 }
805 
806 // Toward -infinity, aka floor
808  return Floor<float, uint32_t, 23, 8>(v);
809 }
811  return Floor<double, uint64_t, 52, 11>(v);
812 }
813 
814 // ================================================== COMPARE
815 
816 template <typename T>
818  return Mask1<T>::FromBool(a.raw == b.raw);
819 }
820 
821 template <typename T>
823  return Mask1<T>::FromBool(a.raw != b.raw);
824 }
825 
826 template <typename T>
828  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
829  return (v & bit) == bit;
830 }
831 
832 template <typename T>
834  return Mask1<T>::FromBool(a.raw < b.raw);
835 }
836 template <typename T>
838  return Mask1<T>::FromBool(a.raw > b.raw);
839 }
840 
841 template <typename T>
843  return Mask1<T>::FromBool(a.raw <= b.raw);
844 }
845 template <typename T>
847  return Mask1<T>::FromBool(a.raw >= b.raw);
848 }
849 
850 // ================================================== MEMORY
851 
852 // ------------------------------ Load
853 
854 template <typename T>
855 HWY_API Vec1<T> Load(Sisd<T> /* tag */, const T* HWY_RESTRICT aligned) {
856  T t;
857  CopyBytes<sizeof(T)>(aligned, &t);
858  return Vec1<T>(t);
859 }
860 
861 template <typename T>
863  const T* HWY_RESTRICT aligned) {
864  return IfThenElseZero(m, Load(d, aligned));
865 }
866 
867 template <typename T>
869  return Load(d, p);
870 }
871 
872 // In some use cases, "load single lane" is sufficient; otherwise avoid this.
873 template <typename T>
875  return Load(d, aligned);
876 }
877 
878 // ------------------------------ Store
879 
880 template <typename T>
881 HWY_API void Store(const Vec1<T> v, Sisd<T> /* tag */,
882  T* HWY_RESTRICT aligned) {
883  CopyBytes<sizeof(T)>(&v.raw, aligned);
884 }
885 
886 template <typename T>
888  return Store(v, d, p);
889 }
890 
891 template <typename T>
893  T* HWY_RESTRICT p) {
894  if (!m.bits) return;
895  StoreU(v, d, p);
896 }
897 
898 // ------------------------------ StoreInterleaved3
899 
901  const Vec1<uint8_t> v2, Sisd<uint8_t> d,
902  uint8_t* HWY_RESTRICT unaligned) {
903  StoreU(v0, d, unaligned + 0);
904  StoreU(v1, d, unaligned + 1);
905  StoreU(v2, d, unaligned + 2);
906 }
907 
909  const Vec1<uint8_t> v2, const Vec1<uint8_t> v3,
911  uint8_t* HWY_RESTRICT unaligned) {
912  StoreU(v0, d, unaligned + 0);
913  StoreU(v1, d, unaligned + 1);
914  StoreU(v2, d, unaligned + 2);
915  StoreU(v3, d, unaligned + 3);
916 }
917 
918 // ------------------------------ Stream
919 
920 template <typename T>
921 HWY_API void Stream(const Vec1<T> v, Sisd<T> d, T* HWY_RESTRICT aligned) {
922  return Store(v, d, aligned);
923 }
924 
925 // ------------------------------ Scatter
926 
927 template <typename T, typename Offset>
929  const Vec1<Offset> offset) {
930  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
931  uint8_t* const base8 = reinterpret_cast<uint8_t*>(base) + offset.raw;
932  return Store(v, d, reinterpret_cast<T*>(base8));
933 }
934 
935 template <typename T, typename Index>
937  const Vec1<Index> index) {
938  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
939  return Store(v, d, base + index.raw);
940 }
941 
942 // ------------------------------ Gather
943 
944 template <typename T, typename Offset>
946  const Vec1<Offset> offset) {
947  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
948  const uintptr_t addr = reinterpret_cast<uintptr_t>(base) + offset.raw;
949  return Load(d, reinterpret_cast<const T*>(addr));
950 }
951 
952 template <typename T, typename Index>
954  const Vec1<Index> index) {
955  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
956  return Load(d, base + index.raw);
957 }
958 
959 // ================================================== CONVERT
960 
961 // ConvertTo and DemoteTo with floating-point input and integer output truncate
962 // (rounding toward zero).
963 
964 template <typename FromT, typename ToT>
966  static_assert(sizeof(ToT) > sizeof(FromT), "Not promoting");
967  // For bits Y > X, floatX->floatY and intX->intY are always representable.
968  return Vec1<ToT>(static_cast<ToT>(from.raw));
969 }
970 
971 // MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(FromT) is here,
972 // so we overload for FromT=double and ToT={float,int32_t}.
974  // Prevent ubsan errors when converting float to narrower integer/float
975  if (std::isinf(from.raw) ||
976  std::fabs(from.raw) > static_cast<double>(HighestValue<float>())) {
977  return Vec1<float>(std::signbit(from.raw) ? LowestValue<float>()
978  : HighestValue<float>());
979  }
980  return Vec1<float>(static_cast<float>(from.raw));
981 }
983  // Prevent ubsan errors when converting int32_t to narrower integer/int32_t
984  if (std::isinf(from.raw) ||
985  std::fabs(from.raw) > static_cast<double>(HighestValue<int32_t>())) {
986  return Vec1<int32_t>(std::signbit(from.raw) ? LowestValue<int32_t>()
987  : HighestValue<int32_t>());
988  }
989  return Vec1<int32_t>(static_cast<int32_t>(from.raw));
990 }
991 
992 template <typename FromT, typename ToT>
994  static_assert(!IsFloat<FromT>(), "FromT=double are handled above");
995  static_assert(sizeof(ToT) < sizeof(FromT), "Not demoting");
996 
997  // Int to int: choose closest value in ToT to `from` (avoids UB)
998  from.raw = HWY_MIN(HWY_MAX(LimitsMin<ToT>(), from.raw), LimitsMax<ToT>());
999  return Vec1<ToT>(static_cast<ToT>(from.raw));
1000 }
1001 
1003 #if HWY_NATIVE_FLOAT16
1004  uint16_t bits16;
1005  CopyBytes<2>(&v.raw, &bits16);
1006 #else
1007  const uint16_t bits16 = v.raw.bits;
1008 #endif
1009  const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
1010  const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
1011  const uint32_t mantissa = bits16 & 0x3FF;
1012 
1013  // Subnormal or zero
1014  if (biased_exp == 0) {
1015  const float subnormal =
1016  (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
1017  return Vec1<float>(sign ? -subnormal : subnormal);
1018  }
1019 
1020  // Normalized: convert the representation directly (faster than ldexp/tables).
1021  const uint32_t biased_exp32 = biased_exp + (127 - 15);
1022  const uint32_t mantissa32 = mantissa << (23 - 10);
1023  const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
1024  float out;
1025  CopyBytes<4>(&bits32, &out);
1026  return Vec1<float>(out);
1027 }
1028 
1030  return Set(d, F32FromBF16(v.raw));
1031 }
1032 
1034  const Vec1<float> v) {
1035  uint32_t bits32;
1036  CopyBytes<4>(&v.raw, &bits32);
1037  const uint32_t sign = bits32 >> 31;
1038  const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
1039  const uint32_t mantissa32 = bits32 & 0x7FFFFF;
1040 
1041  const int32_t exp = HWY_MIN(static_cast<int32_t>(biased_exp32) - 127, 15);
1042 
1043  // Tiny or zero => zero.
1044  Vec1<float16_t> out;
1045  if (exp < -24) {
1046 #if HWY_NATIVE_FLOAT16
1047  const uint16_t zero = 0;
1048  CopyBytes<2>(&zero, &out.raw);
1049 #else
1050  out.raw.bits = 0;
1051 #endif
1052  return out;
1053  }
1054 
1055  uint32_t biased_exp16, mantissa16;
1056 
1057  // exp = [-24, -15] => subnormal
1058  if (exp < -14) {
1059  biased_exp16 = 0;
1060  const uint32_t sub_exp = static_cast<uint32_t>(-14 - exp);
1061  HWY_DASSERT(1 <= sub_exp && sub_exp < 11);
1062  mantissa16 = static_cast<uint32_t>((1u << (10 - sub_exp)) +
1063  (mantissa32 >> (13 + sub_exp)));
1064  } else {
1065  // exp = [-14, 15]
1066  biased_exp16 = static_cast<uint32_t>(exp + 15);
1067  HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
1068  mantissa16 = mantissa32 >> 13;
1069  }
1070 
1071  HWY_DASSERT(mantissa16 < 1024);
1072  const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
1073  HWY_DASSERT(bits16 < 0x10000);
1074 #if HWY_NATIVE_FLOAT16
1075  const uint16_t narrowed = static_cast<uint16_t>(bits16); // big-endian safe
1076  CopyBytes<2>(&narrowed, &out.raw);
1077 #else
1078  out.raw.bits = static_cast<uint16_t>(bits16);
1079 #endif
1080  return out;
1081 }
1082 
1084  return Set(d, BF16FromF32(v.raw));
1085 }
1086 
1087 template <typename FromT, typename ToT, HWY_IF_FLOAT(FromT)>
1089  static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
1090  // float## -> int##: return closest representable value. We cannot exactly
1091  // represent LimitsMax<ToT> in FromT, so use double.
1092  const double f = static_cast<double>(from.raw);
1093  if (std::isinf(from.raw) ||
1094  std::fabs(f) > static_cast<double>(LimitsMax<ToT>())) {
1095  return Vec1<ToT>(std::signbit(from.raw) ? LimitsMin<ToT>()
1096  : LimitsMax<ToT>());
1097  }
1098  return Vec1<ToT>(static_cast<ToT>(from.raw));
1099 }
1100 
1101 template <typename FromT, typename ToT, HWY_IF_NOT_FLOAT(FromT)>
1102 HWY_API Vec1<ToT> ConvertTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
1103  static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
1104  // int## -> float##: no check needed
1105  return Vec1<ToT>(static_cast<ToT>(from.raw));
1106 }
1107 
1109  return DemoteTo(Sisd<uint8_t>(), v);
1110 }
1111 
1112 // ================================================== COMBINE
1113 // UpperHalf, ZeroExtendVector, Combine, Concat* are unsupported.
1114 
1115 template <typename T>
1117  return v;
1118 }
1119 
1120 template <typename T>
1122  return v;
1123 }
1124 
1125 // ================================================== SWIZZLE
1126 
1127 template <typename T>
1129  return v.raw;
1130 }
1131 
1132 template <typename T>
1134  return v;
1135 }
1136 // DupOdd is unsupported.
1137 
1138 template <typename T>
1140  return even;
1141 }
1142 
1143 template <typename T>
1145  return even;
1146 }
1147 
1148 // ------------------------------ SwapAdjacentBlocks
1149 
1150 template <typename T>
1152  return v;
1153 }
1154 
1155 // ------------------------------ TableLookupLanes
1156 
1157 // Returned by SetTableIndices for use by TableLookupLanes.
1158 template <typename T>
1159 struct Indices1 {
1161 };
1162 
1163 template <typename T, typename TI>
1165  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane size");
1166  HWY_DASSERT(vec.raw == 0);
1167  return Indices1<T>{vec.raw};
1168 }
1169 
1170 template <typename T, typename TI>
1172  return IndicesFromVec(d, LoadU(idx));
1173 }
1174 
1175 template <typename T>
1177  return v;
1178 }
1179 
1180 // ------------------------------ ReverseBlocks
1181 
1182 // Single block: no change
1183 template <typename T>
1185  return v;
1186 }
1187 
1188 // ------------------------------ Reverse
1189 
1190 template <typename T>
1192  return v;
1193 }
1194 
1195 template <typename T>
1197  return v;
1198 }
1199 
1200 template <typename T>
1202  return v;
1203 }
1204 
1205 template <typename T>
1207  return v;
1208 }
1209 
1210 // ================================================== BLOCKWISE
1211 // Shift*Bytes, CombineShiftRightBytes, Interleave*, Shuffle* are unsupported.
1212 
1213 // ------------------------------ Broadcast/splat any lane
1214 
1215 template <int kLane, typename T>
1217  static_assert(kLane == 0, "Scalar only has one lane");
1218  return v;
1219 }
1220 
1221 // ------------------------------ TableLookupBytes, TableLookupBytesOr0
1222 
1223 template <typename T, typename TI>
1225  uint8_t in_bytes[sizeof(T)];
1226  uint8_t idx_bytes[sizeof(T)];
1227  uint8_t out_bytes[sizeof(T)];
1228  CopyBytes<sizeof(T)>(&in, &in_bytes);
1229  CopyBytes<sizeof(T)>(&indices, &idx_bytes);
1230  for (size_t i = 0; i < sizeof(T); ++i) {
1231  out_bytes[i] = in_bytes[idx_bytes[i]];
1232  }
1233  TI out;
1234  CopyBytes<sizeof(TI)>(&out_bytes, &out);
1235  return Vec1<TI>{out};
1236 }
1237 
1238 template <typename T, typename TI>
1240  uint8_t in_bytes[sizeof(T)];
1241  uint8_t idx_bytes[sizeof(T)];
1242  uint8_t out_bytes[sizeof(T)];
1243  CopyBytes<sizeof(T)>(&in, &in_bytes);
1244  CopyBytes<sizeof(T)>(&indices, &idx_bytes);
1245  for (size_t i = 0; i < sizeof(T); ++i) {
1246  out_bytes[i] = idx_bytes[i] & 0x80 ? 0 : in_bytes[idx_bytes[i]];
1247  }
1248  TI out;
1249  CopyBytes<sizeof(TI)>(&out_bytes, &out);
1250  return Vec1<TI>{out};
1251 }
1252 
1253 // ------------------------------ ZipLower
1254 
1256  return Vec1<uint16_t>(static_cast<uint16_t>((uint32_t(b.raw) << 8) + a.raw));
1257 }
1259  const Vec1<uint16_t> b) {
1260  return Vec1<uint32_t>((uint32_t(b.raw) << 16) + a.raw);
1261 }
1263  const Vec1<uint32_t> b) {
1264  return Vec1<uint64_t>((uint64_t(b.raw) << 32) + a.raw);
1265 }
1267  return Vec1<int16_t>(static_cast<int16_t>((int32_t(b.raw) << 8) + a.raw));
1268 }
1270  return Vec1<int32_t>((int32_t(b.raw) << 16) + a.raw);
1271 }
1273  return Vec1<int64_t>((int64_t(b.raw) << 32) + a.raw);
1274 }
1275 
// Generic single-lane ZipLower: forms one double-width lane with b's lane in
// the upper half and a's lane in the lower half.
template <typename T, typename TW = MakeWide<T>, class VW = Vec1<TW>>
HWY_API VW ZipLower(Sisd<TW> /* tag */, Vec1<T> a, Vec1<T> b) {
  // Shift b up by the lane width (sizeof(T) * 8 bits), then add a's bits.
  // NOTE(review): for signed TW the left shift of a negative value relies on
  // two's-complement behavior — confirm against the signed overloads above.
  return VW(static_cast<TW>((TW{b.raw} << (sizeof(T) * 8)) + a.raw));
}
1280 
1281 // ================================================== MASK
1282 
1283 template <typename T>
1284 HWY_API bool AllFalse(Sisd<T> /* tag */, const Mask1<T> mask) {
1285  return mask.bits == 0;
1286 }
1287 
1288 template <typename T>
1289 HWY_API bool AllTrue(Sisd<T> /* tag */, const Mask1<T> mask) {
1290  return mask.bits != 0;
1291 }
1292 
1293 // `p` points to at least 8 readable bytes, not all of which need be valid.
1294 template <typename T>
1296  const uint8_t* HWY_RESTRICT bits) {
1297  return Mask1<T>::FromBool((bits[0] & 1) != 0);
1298 }
1299 
1300 // `p` points to at least 8 writable bytes.
1301 template <typename T>
1302 HWY_API size_t StoreMaskBits(Sisd<T> d, const Mask1<T> mask, uint8_t* bits) {
1303  *bits = AllTrue(d, mask);
1304  return 1;
1305 }
1306 
1307 template <typename T>
1308 HWY_API size_t CountTrue(Sisd<T> /* tag */, const Mask1<T> mask) {
1309  return mask.bits == 0 ? 0 : 1;
1310 }
1311 
1312 template <typename T>
1313 HWY_API intptr_t FindFirstTrue(Sisd<T> /* tag */, const Mask1<T> mask) {
1314  return mask.bits == 0 ? -1 : 0;
1315 }
1316 
1317 // ------------------------------ Compress, CompressBits
1318 
// Trait consumed by generic code: nonzero `value` advertises that Compress
// keeps all input lanes (a partition), which holds trivially for one lane.
template <typename T>
struct CompressIsPartition {
  enum { value = 1 };
};
1323 
1324 template <typename T>
1326  // Upper lanes are undefined, so result is the same independent of mask.
1327  return v;
1328 }
1329 
// Compress with mask bits from memory: with a single lane there is nothing to
// reorder, so the input is returned unchanged regardless of `bits`.
template <typename T>
HWY_API Vec1<T> Compress(Vec1<T> v, const uint8_t* HWY_RESTRICT /* bits */) {
  return v;
}
1334 
1335 // ------------------------------ CompressStore
1336 
1337 template <typename T>
1339  T* HWY_RESTRICT unaligned) {
1340  StoreU(Compress(v, mask), d, unaligned);
1341  return CountTrue(d, mask);
1342 }
1343 
1344 // ------------------------------ CompressBlendedStore
1345 
1346 template <typename T>
1348  T* HWY_RESTRICT unaligned) {
1349  if (!mask.bits) return 0;
1350  StoreU(v, d, unaligned);
1351  return 1;
1352 }
1353 
1354 // ------------------------------ CompressBitsStore
1355 
1356 template <typename T>
1357 HWY_API size_t CompressBitsStore(Vec1<T> v, const uint8_t* HWY_RESTRICT bits,
1358  Sisd<T> d, T* HWY_RESTRICT unaligned) {
1359  const Mask1<T> mask = LoadMaskBits(d, bits);
1360  StoreU(Compress(v, mask), d, unaligned);
1361  return CountTrue(d, mask);
1362 }
1363 
1364 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
1365 
1367  Vec1<bfloat16_t> a,
1368  Vec1<bfloat16_t> b,
1369  const Vec1<float> sum0,
1370  Vec1<float>& /* sum1 */) {
1371  return MulAdd(Vec1<float>(F32FromBF16(a.raw)),
1372  Vec1<float>(F32FromBF16(b.raw)), sum0);
1373 }
1374 
1375 // ================================================== REDUCTIONS
1376 
1377 // Sum of all lanes, i.e. the only one.
1378 template <typename T>
1380  return v;
1381 }
1382 template <typename T>
1384  return v;
1385 }
1386 template <typename T>
1388  return v;
1389 }
1390 
1391 // ================================================== Operator wrapper
1392 
1393 template <class V>
1394 HWY_API V Add(V a, V b) {
1395  return a + b;
1396 }
1397 template <class V>
1398 HWY_API V Sub(V a, V b) {
1399  return a - b;
1400 }
1401 
1402 template <class V>
1403 HWY_API V Mul(V a, V b) {
1404  return a * b;
1405 }
1406 template <class V>
1407 HWY_API V Div(V a, V b) {
1408  return a / b;
1409 }
1410 
// Wrapper so generic code can apply a left shift by name.
// NOTE(review): unlike Add/Sub/etc. this is not marked HWY_API; presumably an
// upstream omission — confirm before changing.
template <class V>
V Shl(V value, V amount) {
  return value << amount;
}
// Wrapper so generic code can apply a right shift by name.
// NOTE(review): unlike Add/Sub/etc. this is not marked HWY_API; presumably an
// upstream omission — confirm before changing.
template <class V>
V Shr(V value, V amount) {
  return value >> amount;
}
1419 
1420 template <class V>
1421 HWY_API auto Eq(V a, V b) -> decltype(a == b) {
1422  return a == b;
1423 }
1424 template <class V>
1425 HWY_API auto Ne(V a, V b) -> decltype(a == b) {
1426  return a != b;
1427 }
1428 template <class V>
1429 HWY_API auto Lt(V a, V b) -> decltype(a == b) {
1430  return a < b;
1431 }
1432 
1433 template <class V>
1434 HWY_API auto Gt(V a, V b) -> decltype(a == b) {
1435  return a > b;
1436 }
1437 template <class V>
1438 HWY_API auto Ge(V a, V b) -> decltype(a == b) {
1439  return a >= b;
1440 }
1441 
1442 template <class V>
1443 HWY_API auto Le(V a, V b) -> decltype(a == b) {
1444  return a <= b;
1445 }
1446 
1447 // NOLINTNEXTLINE(google-readability-namespace-comments)
1448 } // namespace HWY_NAMESPACE
1449 } // namespace hwy
#define HWY_MAX(a, b)
Definition: base.h:128
#define HWY_RESTRICT
Definition: base.h:63
#define HWY_API
Definition: base.h:122
#define HWY_MIN(a, b)
Definition: base.h:127
#define HWY_INLINE
Definition: base.h:64
#define HWY_DASSERT(condition)
Definition: base.h:193
Definition: scalar-inl.h:68
Raw bits
Definition: scalar-inl.h:78
hwy::MakeUnsigned< T > Raw
Definition: scalar-inl.h:69
static HWY_INLINE Mask1< T > FromBool(bool b)
Definition: scalar-inl.h:72
HWY_API Mask1< T > operator<=(const Vec1< T > a, const Vec1< T > b)
Definition: scalar-inl.h:842
d
Definition: rvv-inl.h:1656
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:1648
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition: arm_neon-inl.h:4038
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:61
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition: arm_neon-inl.h:1688
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition: arm_neon-inl.h:1225
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition: arm_neon-inl.h:3672
HWY_API uint8_t GetLane(const Vec128< uint8_t, 16 > v)
Definition: arm_neon-inl.h:767
HWY_API Vec128< T, N > PopulationCount(Vec128< T, N > v)
Definition: arm_neon-inl.h:1595
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5252
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2878
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5244
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1122
HWY_API auto Gt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5257
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition: arm_neon-inl.h:1896
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition: arm_neon-inl.h:4761
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1290
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2416
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:1604
HWY_API bool AllTrue(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:4790
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4437
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition: arm_neon-inl.h:1665
V Shl(V a, V b)
Definition: arm_neon-inl.h:5235
HWY_API auto Ge(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5261
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1264
StoreInterleaved3
Definition: rvv-inl.h:1405
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:1957
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:1995
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition: arm_neon-inl.h:1675
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4284
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:953
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:1885
HWY_API Vec1< uint8_t > SaturatedAdd(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:457
sseg3 sseg3 StoreInterleaved4
Definition: rvv-inl.h:1428
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2205
HWY_API Vec128< int64_t > Neg(const Vec128< int64_t > v)
Definition: arm_neon-inl.h:904
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:733
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1513
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1518
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition: arm_neon-inl.h:4119
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2210
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2952
HWY_API Mask1< T > operator==(const Vec1< T > a, const Vec1< T > b)
Definition: scalar-inl.h:817
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition: arm_neon-inl.h:2748
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition: arm_neon-inl.h:3688
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:1505
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1523
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2402
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition: arm_neon-inl.h:4742
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition: arm_neon-inl.h:1681
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 >, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:4753
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:2788
HWY_API Vec1< T > operator+(Vec1< T > a, Vec1< T > b)
Definition: scalar-inl.h:421
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition: arm_neon-inl.h:3987
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:1711
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:5217
HWY_API bool AllFalse(const Full128< T > d, const Mask128< T > m)
Definition: arm_neon-inl.h:4771
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition: arm_neon-inl.h:3419
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3490
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2909
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1344
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:1656
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition: arm_neon-inl.h:1735
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2224
HWY_API Mask128< T, N > operator!=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1815
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2895
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition: arm_neon-inl.h:3373
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition: arm_neon-inl.h:4045
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3461
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition: arm_sve-inl.h:282
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3513
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4445
HWY_API Mask1< T > operator<(const Vec1< T > a, const Vec1< T > b)
Definition: scalar-inl.h:833
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition: arm_neon-inl.h:757
HWY_API Vec1< uint8_t > AverageRound(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:510
HWY_API Vec1< T > ShiftRight(const Vec1< T > v)
Definition: scalar-inl.h:345
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:4510
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition: arm_neon-inl.h:1917
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2031
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1175
HWY_API Vec1< uint8_t > SaturatedSub(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:484
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition: arm_neon-inl.h:1252
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition: arm_neon-inl.h:1498
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:1724
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1440
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:710
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition: arm_neon-inl.h:1211
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4231
HWY_API V Sub(V a, V b)
Definition: arm_neon-inl.h:5221
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition: arm_neon-inl.h:747
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:1889
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5077
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition: arm_neon-inl.h:1133
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4267
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5061
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:1718
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1489
V Ceiling(const V v)
Definition: scalar-inl.h:736
HWY_API Vec1< T > ShiftLeft(const Vec1< T > v)
Definition: scalar-inl.h:339
HWY_API auto Le(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5266
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1126
decltype(detail::DeduceD()(V())) DFromV
Definition: arm_neon-inl.h:555
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2939
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition: arm_neon-inl.h:3413
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4249
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1422
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1323
V Shr(V a, V b)
Definition: arm_neon-inl.h:5239
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2217
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:4019
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3285
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2867
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4441
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5248
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1404
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:2606
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:4169
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5052
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition: arm_neon-inl.h:935
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1455
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:4053
HWY_API V Div(V a, V b)
Definition: arm_neon-inl.h:5230
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1033
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:852
HWY_API V Mul(V a, V b)
Definition: arm_neon-inl.h:5226
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:3430
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2397
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition: arm_neon-inl.h:2426
TFromD< DFromV< V > > TFromV
Definition: arm_neon-inl.h:558
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1376
const vfloat64m1_t v
Definition: rvv-inl.h:1656
HWY_API Vec128< T, N > Compress(Vec128< T, N > v, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:5031
HWY_API Vec1< T > operator-(Vec1< T > a, Vec1< T > b)
Definition: scalar-inl.h:434
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition: arm_neon-inl.h:4224
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1477
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1352
HWY_API Vec1< T > IfThenElse(const Mask1< T > mask, const Vec1< T > yes, const Vec1< T > no)
Definition: scalar-inl.h:278
Definition: aligned_allocator.h:27
HWY_API void CopyBytes(const From *from, To *to)
Definition: base.h:732
HWY_API float F32FromBF16(bfloat16_t bf)
Definition: base.h:746
HWY_API bfloat16_t BF16FromF32(float f)
Definition: base.h:754
constexpr float HighestValue< float >()
Definition: base.h:529
HWY_API size_t PopCount(uint64_t x)
Definition: base.h:667
constexpr float LowestValue< float >()
Definition: base.h:516
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:452
typename detail::Relations< T >::Signed MakeSigned
Definition: base.h:454
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()
#define HWY_NAMESPACE
Definition: set_macros-inl.h:80
@ value
Definition: arm_neon-inl.h:4798
Definition: scalar-inl.h:1159
MakeSigned< T > raw
Definition: scalar-inl.h:1160
Definition: ops/shared-inl.h:40
Definition: scalar-inl.h:35
T raw
Definition: scalar-inl.h:63
Vec1 & operator=(const Vec1 &)=default
HWY_INLINE Vec1 & operator*=(const Vec1 other)
Definition: scalar-inl.h:41
HWY_INLINE Vec1 & operator-=(const Vec1 other)
Definition: scalar-inl.h:50
HWY_INLINE Vec1 & operator+=(const Vec1 other)
Definition: scalar-inl.h:47
Vec1(const Vec1 &)=default
HWY_INLINE Vec1()=default
HWY_INLINE Vec1 & operator/=(const Vec1 other)
Definition: scalar-inl.h:44
HWY_INLINE Vec1 & operator&=(const Vec1 other)
Definition: scalar-inl.h:53
HWY_INLINE Vec1(const T t)
Definition: scalar-inl.h:39
HWY_INLINE Vec1 & operator^=(const Vec1 other)
Definition: scalar-inl.h:59
HWY_INLINE Vec1 & operator|=(const Vec1 other)
Definition: scalar-inl.h:56
Definition: scalar-inl.h:84
Sisd< T > operator()(Vec1< T >) const
Definition: scalar-inl.h:86