Grok  9.7.5
base.h
Go to the documentation of this file.
1 // Copyright 2020 Google LLC
2 // SPDX-License-Identifier: Apache-2.0
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 #ifndef HIGHWAY_HWY_BASE_H_
17 #define HIGHWAY_HWY_BASE_H_
18 
19 // For SIMD module implementations and their callers, target-independent.
20 
21 #include <stddef.h>
22 #include <stdint.h>
23 
24 #include <cfloat>
25 
27 #include "hwy/highway_export.h"
28 
29 #if HWY_ARCH_X86
30 #include <atomic>
31 #endif
32 
//------------------------------------------------------------------------------
// Compiler-specific definitions

// HWY_STR expands its argument and then stringifies the result; HWY_STR_IMPL
// stringifies its argument verbatim.
#define HWY_STR_IMPL(macro) #macro
#define HWY_STR(macro) HWY_STR_IMPL(macro)

#if HWY_COMPILER_MSVC

#include <intrin.h>

#define HWY_RESTRICT __restrict
#define HWY_INLINE __forceinline
#define HWY_NOINLINE __declspec(noinline)
#define HWY_FLATTEN
#define HWY_NORETURN __declspec(noreturn)
// No branch hints in this branch: both macros expand to the bare expression.
#define HWY_LIKELY(expr) (expr)
#define HWY_UNLIKELY(expr) (expr)
#define HWY_PRAGMA(tokens) __pragma(tokens)
#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens))
// Selects the first (MSVC-style) argument.
#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc)
#define HWY_MAYBE_UNUSED
#define HWY_HAS_ASSUME_ALIGNED 0
#if (_MSC_VER >= 1700)
#define HWY_MUST_USE_RESULT _Check_return_
#else
#define HWY_MUST_USE_RESULT
#endif

#else  // !HWY_COMPILER_MSVC

#define HWY_RESTRICT __restrict__
#define HWY_INLINE inline __attribute__((always_inline))
#define HWY_NOINLINE __attribute__((noinline))
#define HWY_FLATTEN __attribute__((flatten))
#define HWY_NORETURN __attribute__((noreturn))
// `!!` normalizes the expression to 0/1 before it is passed to the builtin.
#define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1)
#define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
#define HWY_PRAGMA(tokens) _Pragma(#tokens)
#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens)
// Selects the second (GCC-style) argument.
#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc)
// Encountered "attribute list cannot appear here" when using the C++17
// [[maybe_unused]], so only use the old style attribute for now.
#define HWY_MAYBE_UNUSED __attribute__((unused))
#define HWY_MUST_USE_RESULT __attribute__((warn_unused_result))

#endif  // !HWY_COMPILER_MSVC
79 
80 //------------------------------------------------------------------------------
81 // Builtin/attributes
82 
83 // Enables error-checking of format strings.
84 #if HWY_HAS_ATTRIBUTE(__format__)
85 #define HWY_FORMAT(idx_fmt, idx_arg) \
86  __attribute__((__format__(__printf__, idx_fmt, idx_arg)))
87 #else
88 #define HWY_FORMAT(idx_fmt, idx_arg)
89 #endif
90 
91 // Returns a void* pointer which the compiler then assumes is N-byte aligned.
92 // Example: float* HWY_RESTRICT aligned = (float*)HWY_ASSUME_ALIGNED(in, 32);
93 //
94 // The assignment semantics are required by GCC/Clang. ICC provides an in-place
95 // __assume_aligned, whereas MSVC's __assume appears unsuitable.
96 #if HWY_HAS_BUILTIN(__builtin_assume_aligned)
97 #define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
98 #else
99 #define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
100 #endif
101 
102 // Clang and GCC require attributes on each function into which SIMD intrinsics
103 // are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
104 // automatic annotation via pragmas.
105 #if HWY_COMPILER_CLANG
106 #define HWY_PUSH_ATTRIBUTES(targets_str) \
107  HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \
108  apply_to = function))
109 #define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop)
110 #elif HWY_COMPILER_GCC
111 #define HWY_PUSH_ATTRIBUTES(targets_str) \
112  HWY_PRAGMA(GCC push_options) HWY_PRAGMA(GCC target targets_str)
113 #define HWY_POP_ATTRIBUTES HWY_PRAGMA(GCC pop_options)
114 #else
115 #define HWY_PUSH_ATTRIBUTES(targets_str)
116 #define HWY_POP_ATTRIBUTES
117 #endif
118 
//------------------------------------------------------------------------------
// Macros

#define HWY_API static HWY_INLINE HWY_FLATTEN HWY_MAYBE_UNUSED

// Token pasting with one level of indirection so that macro arguments are
// expanded before being concatenated.
#define HWY_CONCAT_IMPL(a, b) a##b
#define HWY_CONCAT(a, b) HWY_CONCAT_IMPL(a, b)

// Note: each argument may be evaluated twice; avoid side effects.
#define HWY_MIN(a, b) ((a) < (b) ? (a) : (b))
#define HWY_MAX(a, b) ((a) > (b) ? (a) : (b))

// Compile-time fence to prevent undesirable code reordering. On Clang x86, the
// typical asm volatile("" : : : "memory") has no effect, whereas atomic fence
// does, without generating code.
#if HWY_ARCH_X86
#define HWY_FENCE std::atomic_thread_fence(std::memory_order_acq_rel)
#else
// TODO(janwas): investigate alternatives. On ARM, the above generates barriers.
#define HWY_FENCE
#endif

// 4 instances of a given literal value, useful as input to LoadDup128.
#define HWY_REP4(literal) literal, literal, literal, literal

// Forwards the call site's file/line plus a printf-style message to
// ::hwy::Abort. `##__VA_ARGS__` drops the preceding comma when no variadic
// arguments are given.
#define HWY_ABORT(format, ...) \
  ::hwy::Abort(__FILE__, __LINE__, format, ##__VA_ARGS__)

// Always enabled. Aborts with the stringified condition if it is false; the
// do/while wrapper makes the macro usable as a single statement.
#define HWY_ASSERT(condition)             \
  do {                                    \
    if (!(condition)) {                   \
      HWY_ABORT("Assert %s", #condition); \
    }                                     \
  } while (0)
153 
154 #if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER)
155 #define HWY_IS_MSAN 1
156 #else
157 #define HWY_IS_MSAN 0
158 #endif
159 
160 #if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER)
161 #define HWY_IS_ASAN 1
162 #else
163 #define HWY_IS_ASAN 0
164 #endif
165 
166 #if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER)
167 #define HWY_IS_TSAN 1
168 #else
169 #define HWY_IS_TSAN 0
170 #endif
171 
172 // MSAN may cause lengthy build times or false positives e.g. in AVX3 DemoteTo.
173 // You can disable MSAN by adding this attribute to the function that fails.
174 #if HWY_IS_MSAN
175 #define HWY_ATTR_NO_MSAN __attribute__((no_sanitize_memory))
176 #else
177 #define HWY_ATTR_NO_MSAN
178 #endif
179 
180 // For enabling HWY_DASSERT and shortening tests in slower debug builds
181 #if !defined(HWY_IS_DEBUG_BUILD)
182 // Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent
183 // MSVC defines NDEBUG (if not, could instead check _DEBUG).
184 #if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_ASAN || \
185  HWY_IS_MSAN || HWY_IS_TSAN || defined(__clang_analyzer__)
186 #define HWY_IS_DEBUG_BUILD 1
187 #else
188 #define HWY_IS_DEBUG_BUILD 0
189 #endif
190 #endif // HWY_IS_DEBUG_BUILD
191 
192 #if HWY_IS_DEBUG_BUILD
193 #define HWY_DASSERT(condition) HWY_ASSERT(condition)
194 #else
195 #define HWY_DASSERT(condition) \
196  do { \
197  } while (0)
198 #endif
199 
200 namespace hwy {
201 
202 //------------------------------------------------------------------------------
203 // kMaxVectorSize (undocumented, pending removal)
204 
205 #if HWY_ARCH_X86
206 static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64; // AVX-512
207 #elif HWY_ARCH_RVV && defined(__riscv_vector)
208 // Not actually an upper bound on the size.
209 static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096;
210 #else
211 static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
212 #endif
213 
//------------------------------------------------------------------------------
// Alignment

// Potentially useful for LoadDup128 and capped vectors. In other cases, arrays
// should be allocated dynamically via aligned_allocator.h because Lanes() may
// exceed the stack size.
#if HWY_ARCH_X86
#define HWY_ALIGN_MAX alignas(64)
#elif HWY_ARCH_RVV && defined(__riscv_vector)
#define HWY_ALIGN_MAX alignas(8)  // only elements need be aligned
#else
#define HWY_ALIGN_MAX alignas(16)
#endif
227 
//------------------------------------------------------------------------------
// Lane types

// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
// by concatenating base type and bits.

// 1 if bit 1 of __ARM_FP is set on an ARM target, otherwise 0.
#if HWY_ARCH_ARM && (__ARM_FP & 2)
#define HWY_NATIVE_FLOAT16 1
#else
#define HWY_NATIVE_FLOAT16 0
#endif

// Pack to 1-byte alignment so the wrapper structs below are exactly the size
// of their single uint16_t member.
#pragma pack(push, 1)

#if HWY_NATIVE_FLOAT16
using float16_t = __fp16;
// Clang does not allow __fp16 arguments, but scalar.h requires LaneType
// arguments, so use a wrapper.
// TODO(janwas): replace with _Float16 when that is supported?
#else
// Holds the 16 raw bits; no arithmetic operators are defined on it.
struct float16_t {
  uint16_t bits;
};
#endif

// Holds the 16 raw bits of a bfloat16 value; see F32FromBF16/BF16FromF32 for
// conversion to and from float.
struct bfloat16_t {
  uint16_t bits;
};

#pragma pack(pop)

using float32_t = float;
using float64_t = double;
261 
//------------------------------------------------------------------------------
// Controlling overload resolution (SFINAE)

// Primary template has no nested `type`, so EnableIfT<false>::type is
// ill-formed and removes the overload from consideration.
template <bool Condition>
struct EnableIfT {};
// Only the `true` specialization provides `type` (void).
template <>
struct EnableIfT<true> {
  using type = void;
};

// Shorthand: `void` if Condition is true, otherwise a substitution failure.
template <bool Condition>
using EnableIf = typename EnableIfT<Condition>::type;
274 
275 template <typename T, typename U>
276 struct IsSameT {
277  enum { value = 0 };
278 };
279 
280 template <typename T>
281 struct IsSameT<T, T> {
282  enum { value = 1 };
283 };
284 
285 template <typename T, typename U>
286 HWY_API constexpr bool IsSame() {
287  return IsSameT<T, U>::value;
288 }
289 
// Insert into template/function arguments to enable this overload only for
// vectors of AT MOST this many bits.
//
// Note that enabling for exactly 128 bits is unnecessary because a function can
// simply be overloaded with Vec128<T> and/or Full128<T> tag. Enabling for other
// sizes (e.g. 64 bit) can be achieved via Simd<T, 8 / sizeof(T), 0>.
//
// Each condition is parenthesized so that comparison operators cannot be
// mistaken for the end of the template argument list.
#define HWY_IF_LE128(T, N) hwy::EnableIf<(N * sizeof(T) <= 16)>* = nullptr
#define HWY_IF_LE64(T, N) hwy::EnableIf<(N * sizeof(T) <= 8)>* = nullptr
#define HWY_IF_LE32(T, N) hwy::EnableIf<(N * sizeof(T) <= 4)>* = nullptr
#define HWY_IF_GE32(T, N) hwy::EnableIf<(N * sizeof(T) >= 4)>* = nullptr
#define HWY_IF_GE64(T, N) hwy::EnableIf<(N * sizeof(T) >= 8)>* = nullptr
#define HWY_IF_GE128(T, N) hwy::EnableIf<(N * sizeof(T) >= 16)>* = nullptr
#define HWY_IF_GT128(T, N) hwy::EnableIf<(N * sizeof(T) > 16)>* = nullptr

// Enable by lane-type category. All helpers are qualified with hwy:: so the
// macros also work when expanded outside namespace hwy.
#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!hwy::IsSigned<T>()>* = nullptr
// Signed integers only: floating-point types are excluded.
#define HWY_IF_SIGNED(T) \
  hwy::EnableIf<hwy::IsSigned<T>() && !hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr

// Enable by lane size in bytes.
#define HWY_IF_LANE_SIZE(T, bytes) \
  hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
#define HWY_IF_NOT_LANE_SIZE(T, bytes) \
  hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
314 
// Empty struct used as a size tag type.
template <size_t N>
struct SizeTag {};

// Strips a top-level `const`: the primary template passes T through ...
template <class T>
struct RemoveConstT {
  using type = T;
};
// ... and the specialization for `const T` yields the unqualified T.
template <class T>
struct RemoveConstT<const T> {
  using type = T;
};

// Shorthand for RemoveConstT<T>::type.
template <class T>
using RemoveConst = typename RemoveConstT<T>::type;
328 template <class T>
330 
331 //------------------------------------------------------------------------------
332 // Type relations
333 
334 namespace detail {
335 
336 template <typename T>
337 struct Relations;
338 template <>
339 struct Relations<uint8_t> {
340  using Unsigned = uint8_t;
341  using Signed = int8_t;
342  using Wide = uint16_t;
343 };
344 template <>
345 struct Relations<int8_t> {
346  using Unsigned = uint8_t;
347  using Signed = int8_t;
348  using Wide = int16_t;
349 };
350 template <>
351 struct Relations<uint16_t> {
352  using Unsigned = uint16_t;
353  using Signed = int16_t;
354  using Wide = uint32_t;
355  using Narrow = uint8_t;
356 };
357 template <>
358 struct Relations<int16_t> {
359  using Unsigned = uint16_t;
360  using Signed = int16_t;
361  using Wide = int32_t;
362  using Narrow = int8_t;
363 };
364 template <>
365 struct Relations<uint32_t> {
366  using Unsigned = uint32_t;
367  using Signed = int32_t;
368  using Float = float;
369  using Wide = uint64_t;
370  using Narrow = uint16_t;
371 };
372 template <>
373 struct Relations<int32_t> {
374  using Unsigned = uint32_t;
375  using Signed = int32_t;
376  using Float = float;
377  using Wide = int64_t;
378  using Narrow = int16_t;
379 };
380 template <>
381 struct Relations<uint64_t> {
382  using Unsigned = uint64_t;
383  using Signed = int64_t;
384  using Float = double;
385  using Narrow = uint32_t;
386 };
387 template <>
388 struct Relations<int64_t> {
389  using Unsigned = uint64_t;
390  using Signed = int64_t;
391  using Float = double;
392  using Narrow = int32_t;
393 };
394 template <>
396  using Unsigned = uint16_t;
397  using Signed = int16_t;
398  using Float = float16_t;
399  using Wide = float;
400 };
401 template <>
403  using Unsigned = uint16_t;
404  using Signed = int16_t;
405  using Wide = float;
406 };
407 template <>
408 struct Relations<float> {
409  using Unsigned = uint32_t;
410  using Signed = int32_t;
411  using Float = float;
412  using Wide = double;
413  using Narrow = float16_t;
414 };
415 template <>
416 struct Relations<double> {
417  using Unsigned = uint64_t;
418  using Signed = int64_t;
419  using Float = double;
420  using Narrow = float;
421 };
422 
423 template <size_t N>
425 template <>
426 struct TypeFromSize<1> {
427  using Unsigned = uint8_t;
428  using Signed = int8_t;
429 };
430 template <>
431 struct TypeFromSize<2> {
432  using Unsigned = uint16_t;
433  using Signed = int16_t;
434 };
435 template <>
436 struct TypeFromSize<4> {
437  using Unsigned = uint32_t;
438  using Signed = int32_t;
439  using Float = float;
440 };
441 template <>
442 struct TypeFromSize<8> {
443  using Unsigned = uint64_t;
444  using Signed = int64_t;
445  using Float = double;
446 };
447 
448 } // namespace detail
449 
450 // Aliases for types of a different category, but the same size.
451 template <typename T>
453 template <typename T>
455 template <typename T>
457 
458 // Aliases for types of the same category, but different size.
459 template <typename T>
461 template <typename T>
463 
464 // Obtain type from its size [bytes].
465 template <size_t N>
467 template <size_t N>
469 template <size_t N>
471 
472 //------------------------------------------------------------------------------
473 // Type traits
474 
475 template <typename T>
476 HWY_API constexpr bool IsFloat() {
477  // Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or
478  // from a float, not compared.
479  return IsSame<T, float>() || IsSame<T, double>();
480 }
481 
482 template <typename T>
483 HWY_API constexpr bool IsSigned() {
484  return T(0) > T(-1);
485 }
486 template <>
487 constexpr bool IsSigned<float16_t>() {
488  return true;
489 }
490 template <>
491 constexpr bool IsSigned<bfloat16_t>() {
492  return true;
493 }
494 
495 // Largest/smallest representable integer values.
496 template <typename T>
497 HWY_API constexpr T LimitsMax() {
498  static_assert(!IsFloat<T>(), "Only for integer types");
499  using TU = MakeUnsigned<T>;
500  return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~0ull) >> 1)
501  : static_cast<TU>(~0ull));
502 }
503 template <typename T>
504 HWY_API constexpr T LimitsMin() {
505  static_assert(!IsFloat<T>(), "Only for integer types");
506  return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0);
507 }
508 
509 // Largest/smallest representable value (integer or float). This naming avoids
510 // confusion with numeric_limits<float>::min() (the smallest positive value).
511 template <typename T>
512 HWY_API constexpr T LowestValue() {
513  return LimitsMin<T>();
514 }
515 template <>
516 constexpr float LowestValue<float>() {
517  return -FLT_MAX;
518 }
519 template <>
520 constexpr double LowestValue<double>() {
521  return -DBL_MAX;
522 }
523 
524 template <typename T>
525 HWY_API constexpr T HighestValue() {
526  return LimitsMax<T>();
527 }
528 template <>
529 constexpr float HighestValue<float>() {
530  return FLT_MAX;
531 }
532 template <>
533 constexpr double HighestValue<double>() {
534  return DBL_MAX;
535 }
536 
// Returns bitmask of the exponent field in IEEE binary32/64.
// The primary template must not be instantiated: its static_assert depends on
// T and fails for every type, so only the specializations are usable.
template <typename T>
constexpr T ExponentMask() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
template <>
constexpr uint32_t ExponentMask<uint32_t>() {
  return 0x7F800000;
}
template <>
constexpr uint64_t ExponentMask<uint64_t>() {
  return 0x7FF0000000000000ULL;
}

// Returns bitmask of the mantissa field in IEEE binary32/64.
template <typename T>
constexpr T MantissaMask() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
template <>
constexpr uint32_t MantissaMask<uint32_t>() {
  return 0x007FFFFF;
}
template <>
constexpr uint64_t MantissaMask<uint64_t>() {
  return 0x000FFFFFFFFFFFFFULL;
}

// Returns 1 << mantissa_bits as a floating-point number. All integers whose
// absolute value are less than this can be represented exactly.
template <typename T>
constexpr T MantissaEnd() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
template <>
constexpr float MantissaEnd<float>() {
  return 8388608.0f;  // 1 << 23
}
template <>
constexpr double MantissaEnd<double>() {
  // floating point literal with p52 requires C++17.
  return 4503599627370496.0;  // 1 << 52
}
583 
//------------------------------------------------------------------------------
// Helper functions

// Returns a / b rounded up, computed as (a + b - 1) / b in the type resulting
// from the usual arithmetic conversions and then converted to T1.
// `b` must be nonzero, and a + b - 1 must not overflow.
template <typename T1, typename T2>
constexpr inline T1 DivCeil(T1 a, T2 b) {
  return (a + b - 1) / b;
}

// Returns the smallest multiple of `align` that is >= `what`.
// Works for any nonzero `align`; if a power of two, compiler emits ADD+AND.
constexpr inline size_t RoundUpTo(size_t what, size_t align) {
  return DivCeil(what, align) * align;
}
596 
597 // Undefined results for x == 0.
598 HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
599 #if HWY_COMPILER_MSVC
600  unsigned long index; // NOLINT
601  _BitScanForward(&index, x);
602  return index;
603 #else // HWY_COMPILER_MSVC
604  return static_cast<size_t>(__builtin_ctz(x));
605 #endif // HWY_COMPILER_MSVC
606 }
607 
608 HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
609 #if HWY_COMPILER_MSVC
610 #if HWY_ARCH_X86_64
611  unsigned long index; // NOLINT
612  _BitScanForward64(&index, x);
613  return index;
614 #else // HWY_ARCH_X86_64
615  // _BitScanForward64 not available
616  uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
617  unsigned long index;
618  if (lsb == 0) {
619  uint32_t msb = static_cast<uint32_t>(x >> 32u);
620  _BitScanForward(&index, msb);
621  return 32 + index;
622  } else {
623  _BitScanForward(&index, lsb);
624  return index;
625  }
626 #endif // HWY_ARCH_X86_64
627 #else // HWY_COMPILER_MSVC
628  return static_cast<size_t>(__builtin_ctzll(x));
629 #endif // HWY_COMPILER_MSVC
630 }
631 
632 // Undefined results for x == 0.
633 HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
634 #if HWY_COMPILER_MSVC
635  unsigned long index; // NOLINT
636  _BitScanReverse(&index, x);
637  return 31 - index;
638 #else // HWY_COMPILER_MSVC
639  return static_cast<size_t>(__builtin_clz(x));
640 #endif // HWY_COMPILER_MSVC
641 }
642 
643 HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
644 #if HWY_COMPILER_MSVC
645 #if HWY_ARCH_X86_64
646  unsigned long index; // NOLINT
647  _BitScanReverse64(&index, x);
648  return 63 - index;
649 #else // HWY_ARCH_X86_64
650  // _BitScanReverse64 not available
651  const uint32_t msb = static_cast<uint32_t>(x >> 32u);
652  unsigned long index;
653  if (msb == 0) {
654  const uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
655  _BitScanReverse(&index, lsb);
656  return 63 - index;
657  } else {
658  _BitScanReverse(&index, msb);
659  return 31 - index;
660  }
661 #endif // HWY_ARCH_X86_64
662 #else // HWY_COMPILER_MSVC
663  return static_cast<size_t>(__builtin_clzll(x));
664 #endif // HWY_COMPILER_MSVC
665 }
666 
667 HWY_API size_t PopCount(uint64_t x) {
668 #if HWY_COMPILER_CLANG || HWY_COMPILER_GCC
669  return static_cast<size_t>(__builtin_popcountll(x));
670  // This instruction has a separate feature flag, but is often called from
671  // non-SIMD code, so we don't want to require dynamic dispatch. It was first
672  // supported by Intel in Nehalem (SSE4.2), but MSVC only predefines a macro
673  // for AVX, so check for that.
674 #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
675  return _mm_popcnt_u64(x);
676 #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
677  return _mm_popcnt_u32(uint32_t(x)) + _mm_popcnt_u32(uint32_t(x >> 32));
678 #else
679  x -= ((x >> 1) & 0x5555555555555555ULL);
680  x = (((x >> 2) & 0x3333333333333333ULL) + (x & 0x3333333333333333ULL));
681  x = (((x >> 4) + x) & 0x0F0F0F0F0F0F0F0FULL);
682  x += (x >> 8);
683  x += (x >> 16);
684  x += (x >> 32);
685  return static_cast<size_t>(x & 0x7Fu);
686 #endif
687 }
688 
// Skip HWY_API due to GCC "function not considered for inlining". Previously
// such errors were caused by underlying type mismatches, but it's not clear
// what is still mismatched despite all the casts.

// Returns floor(log2(x)) by counting how many right-shifts reduce x to 1.
// Requires x >= 1: for x == 0 the recursion never reaches its base case.
template <typename TI>
/*HWY_API*/ constexpr size_t FloorLog2(TI x) {
  return x == TI{1}
             ? 0
             : static_cast<size_t>(FloorLog2(static_cast<TI>(x >> 1)) + 1);
}

// Returns ceil(log2(x)), computed as FloorLog2(x - 1) + 1 for x > 1.
// Requires x >= 1 (see FloorLog2).
template <typename TI>
/*HWY_API*/ constexpr size_t CeilLog2(TI x) {
  return x == TI{1}
             ? 0
             : static_cast<size_t>(FloorLog2(static_cast<TI>(x - 1)) + 1);
}
705 
706 #if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
707 #pragma intrinsic(_umul128)
708 #endif
709 
710 // 64 x 64 = 128 bit multiplication
711 HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
712 #if defined(__SIZEOF_INT128__)
713  __uint128_t product = (__uint128_t)a * (__uint128_t)b;
714  *upper = (uint64_t)(product >> 64);
715  return (uint64_t)(product & 0xFFFFFFFFFFFFFFFFULL);
716 #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
717  return _umul128(a, b, upper);
718 #else
719  constexpr uint64_t kLo32 = 0xFFFFFFFFU;
720  const uint64_t lo_lo = (a & kLo32) * (b & kLo32);
721  const uint64_t hi_lo = (a >> 32) * (b & kLo32);
722  const uint64_t lo_hi = (a & kLo32) * (b >> 32);
723  const uint64_t hi_hi = (a >> 32) * (b >> 32);
724  const uint64_t t = (lo_lo >> 32) + (hi_lo & kLo32) + lo_hi;
725  *upper = (hi_lo >> 32) + (t >> 32) + hi_hi;
726  return (t << 32) | (lo_lo & kLo32);
727 #endif
728 }
729 
730 // The source/destination must not overlap/alias.
731 template <size_t kBytes, typename From, typename To>
732 HWY_API void CopyBytes(const From* from, To* to) {
733 #if HWY_COMPILER_MSVC
734  const uint8_t* HWY_RESTRICT from_bytes =
735  reinterpret_cast<const uint8_t*>(from);
736  uint8_t* HWY_RESTRICT to_bytes = reinterpret_cast<uint8_t*>(to);
737  for (size_t i = 0; i < kBytes; ++i) {
738  to_bytes[i] = from_bytes[i];
739  }
740 #else
741  // Avoids horrible codegen on Clang (series of PINSRB)
742  __builtin_memcpy(to, from, kBytes);
743 #endif
744 }
745 
747  uint32_t bits = bf.bits;
748  bits <<= 16;
749  float f;
750  CopyBytes<4>(&bits, &f);
751  return f;
752 }
753 
755  uint32_t bits;
756  CopyBytes<4>(&f, &bits);
757  bfloat16_t bf;
758  bf.bits = static_cast<uint16_t>(bits >> 16);
759  return bf;
760 }
761 
763  Abort(const char* file, int line, const char* format, ...);
764 
765 } // namespace hwy
766 
767 #endif // HIGHWAY_HWY_BASE_H_
#define HWY_RESTRICT
Definition: base.h:63
#define HWY_NORETURN
Definition: base.h:67
#define HWY_API
Definition: base.h:122
#define HWY_MAYBE_UNUSED
Definition: base.h:75
#define HWY_DLLEXPORT
Definition: highway_export.h:18
Definition: aligned_allocator.h:27
HWY_API void CopyBytes(const From *from, To *to)
Definition: base.h:732
constexpr T MantissaEnd()
Definition: base.h:570
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition: base.h:608
HWY_API float F32FromBF16(bfloat16_t bf)
Definition: base.h:746
constexpr HWY_API T LimitsMin()
Definition: base.h:504
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition: base.h:711
HWY_API bfloat16_t BF16FromF32(float f)
Definition: base.h:754
typename detail::TypeFromSize< N >::Float FloatFromSize
Definition: base.h:470
typename RemoveConstT< T >::type RemoveConst
Definition: base.h:329
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition: base.h:466
constexpr float HighestValue< float >()
Definition: base.h:529
constexpr T ExponentMask()
Definition: base.h:539
constexpr HWY_API T LimitsMax()
Definition: base.h:497
typename detail::TypeFromSize< N >::Signed SignedFromSize
Definition: base.h:468
constexpr T1 DivCeil(T1 a, T2 b)
Definition: base.h:588
constexpr float MantissaEnd< float >()
Definition: base.h:575
double float64_t
Definition: base.h:260
constexpr bool IsSigned< bfloat16_t >()
Definition: base.h:491
constexpr size_t FloorLog2(TI x)
Definition: base.h:693
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x)
Definition: base.h:633
constexpr bool IsSigned< float16_t >()
Definition: base.h:487
constexpr double HighestValue< double >()
Definition: base.h:533
typename EnableIfT< Condition >::type EnableIf
Definition: base.h:273
constexpr HWY_API bool IsFloat()
Definition: base.h:476
float float32_t
Definition: base.h:259
HWY_API size_t PopCount(uint64_t x)
Definition: base.h:667
constexpr double MantissaEnd< double >()
Definition: base.h:579
constexpr uint64_t ExponentMask< uint64_t >()
Definition: base.h:548
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x)
Definition: base.h:598
constexpr float LowestValue< float >()
Definition: base.h:516
constexpr HWY_API bool IsSame()
Definition: base.h:286
constexpr size_t CeilLog2(TI x)
Definition: base.h:700
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x)
Definition: base.h:643
constexpr double LowestValue< double >()
Definition: base.h:520
constexpr uint32_t ExponentMask< uint32_t >()
Definition: base.h:544
constexpr HWY_API T LowestValue()
Definition: base.h:512
constexpr HWY_API T HighestValue()
Definition: base.h:525
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize
Definition: base.h:211
constexpr T MantissaMask()
Definition: base.h:554
constexpr HWY_API bool IsSigned()
Definition: base.h:483
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:452
constexpr uint32_t MantissaMask< uint32_t >()
Definition: base.h:559
typename detail::Relations< T >::Wide MakeWide
Definition: base.h:460
constexpr uint64_t MantissaMask< uint64_t >()
Definition: base.h:563
typename detail::Relations< T >::Float MakeFloat
Definition: base.h:456
typename detail::Relations< T >::Signed MakeSigned
Definition: base.h:454
HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4) Abort(const char *file
HWY_DLLEXPORT HWY_NORETURN void int const char * format
Definition: base.h:763
HWY_DLLEXPORT HWY_NORETURN void int line
Definition: base.h:763
constexpr size_t RoundUpTo(size_t what, size_t align)
Definition: base.h:593
typename detail::Relations< T >::Narrow MakeNarrow
Definition: base.h:462
void type
Definition: base.h:269
Definition: base.h:266
Definition: base.h:276
@ value
Definition: base.h:277
T type
Definition: base.h:325
Definition: base.h:320
T type
Definition: base.h:321
Definition: base.h:317
Definition: base.h:253
uint16_t bits
Definition: base.h:254
int16_t Signed
Definition: base.h:404
float Wide
Definition: base.h:405
uint16_t Unsigned
Definition: base.h:403
double Float
Definition: base.h:419
uint64_t Unsigned
Definition: base.h:417
int64_t Signed
Definition: base.h:418
float Narrow
Definition: base.h:420
int16_t Signed
Definition: base.h:397
float Wide
Definition: base.h:399
uint16_t Unsigned
Definition: base.h:396
uint32_t Unsigned
Definition: base.h:409
double Wide
Definition: base.h:412
float Float
Definition: base.h:411
int32_t Signed
Definition: base.h:410
uint16_t Unsigned
Definition: base.h:359
int16_t Signed
Definition: base.h:360
int32_t Wide
Definition: base.h:361
int8_t Narrow
Definition: base.h:362
uint32_t Unsigned
Definition: base.h:374
int64_t Wide
Definition: base.h:377
float Float
Definition: base.h:376
int16_t Narrow
Definition: base.h:378
int32_t Signed
Definition: base.h:375
int32_t Narrow
Definition: base.h:392
double Float
Definition: base.h:391
uint64_t Unsigned
Definition: base.h:389
int64_t Signed
Definition: base.h:390
int16_t Wide
Definition: base.h:348
int8_t Signed
Definition: base.h:347
uint8_t Unsigned
Definition: base.h:346
uint8_t Narrow
Definition: base.h:355
int16_t Signed
Definition: base.h:353
uint32_t Wide
Definition: base.h:354
uint16_t Unsigned
Definition: base.h:352
uint32_t Unsigned
Definition: base.h:366
uint64_t Wide
Definition: base.h:369
uint16_t Narrow
Definition: base.h:370
float Float
Definition: base.h:368
int32_t Signed
Definition: base.h:367
uint32_t Narrow
Definition: base.h:385
int64_t Signed
Definition: base.h:383
uint64_t Unsigned
Definition: base.h:382
double Float
Definition: base.h:384
int8_t Signed
Definition: base.h:341
uint8_t Unsigned
Definition: base.h:340
uint16_t Wide
Definition: base.h:342
Definition: base.h:337
int8_t Signed
Definition: base.h:428
uint8_t Unsigned
Definition: base.h:427
int16_t Signed
Definition: base.h:433
uint16_t Unsigned
Definition: base.h:432
int32_t Signed
Definition: base.h:438
uint32_t Unsigned
Definition: base.h:437
float Float
Definition: base.h:439
double Float
Definition: base.h:445
int64_t Signed
Definition: base.h:444
uint64_t Unsigned
Definition: base.h:443
Definition: base.h:424
Definition: base.h:248
uint16_t bits
Definition: base.h:249