Grok  9.7.5
detect_targets.h
Go to the documentation of this file.
1 // Copyright 2021 Google LLC
2 // SPDX-License-Identifier: Apache-2.0
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 #ifndef HIGHWAY_HWY_DETECT_TARGETS_H_
17 #define HIGHWAY_HWY_DETECT_TARGETS_H_
18 
19 // Defines targets and chooses which to enable.
20 
21 #include "hwy/detect_compiler_arch.h"
22 
23 //------------------------------------------------------------------------------
24 // Optional configuration
25 
26 // See ../quick_reference.md for documentation of these macros.
27 
28 // Uncomment to override the default baseline determined from predefined macros:
29 // #define HWY_BASELINE_TARGETS (HWY_SSE4 | HWY_SCALAR)
30 
31 // Uncomment to override the default blocklist:
32 // #define HWY_BROKEN_TARGETS HWY_AVX3
33 
34 // Uncomment to definitely avoid generating those target(s):
35 // #define HWY_DISABLED_TARGETS HWY_SSE4
36 
37 // Uncomment to avoid emitting BMI/BMI2/FMA instructions (allows generating
38 // AVX2 target for VMs which support AVX2 but not the other instruction sets)
39 // #define HWY_DISABLE_BMI2_FMA
40 
41 //------------------------------------------------------------------------------
42 // Targets
43 
44 // One distinct bit per target. Within the same group/platform, a lower value
45 // is "better" (e.g. more lanes) than a higher value - see HWY_STATIC_TARGET.
46 //
47 // All values are defined unconditionally so that HWY_TARGETS can be tested
48 // without first checking the HWY_ARCH_* macros.
49 //
50 // The C99 preprocessor evaluates #if expressions using intmax_t types, so
51 // 32-bit literals are safe to use here.
52 
53 // 1,2: reserved
54 
55 // Currently satisfiable by Ice Lake (VNNI, VPCLMULQDQ, VBMI2, VAES). Later to
56 // be added: BF16 (Cooper Lake). VP2INTERSECT is only in Tiger Lake? We do not
57 // yet have uses for VBMI, VPOPCNTDQ, BITALG, GFNI.
58 #define HWY_AVX3_DL 4 // see HWY_WANT_AVX3_DL below
59 #define HWY_AVX3 8 // 0x08
60 #define HWY_AVX2 16 // 0x10
61 // 32: reserved for AVX
62 #define HWY_SSE4 64 // 0x40
63 #define HWY_SSSE3 128 // 0x80
64 // 0x100, 0x200: reserved for SSE3, SSE2
65 
66 // The highest bit in the HWY_TARGETS mask that an x86 target can have. Used for
67 // dynamic dispatch. All x86 target bits must be lower or equal to
68 // (1 << HWY_HIGHEST_TARGET_BIT_X86) and they can only use
69 // HWY_MAX_DYNAMIC_TARGETS in total.
70 #define HWY_HIGHEST_TARGET_BIT_X86 9 // 1 << 9 == 0x200, the reserved SSE2 slot
71 
72 #define HWY_SVE2 0x400
73 #define HWY_SVE 0x800
74 // 0x1000 reserved for Helium
75 #define HWY_NEON 0x2000
76 
77 #define HWY_HIGHEST_TARGET_BIT_ARM 13 // 1 << 13 == 0x2000 == HWY_NEON
78 
79 // 0x4000, 0x8000 reserved
80 #define HWY_PPC8 0x10000 // v2.07 or 3
81 // 0x20000, 0x40000 reserved for prior VSX/AltiVec
82 
83 #define HWY_HIGHEST_TARGET_BIT_PPC 18 // 1 << 18 == 0x40000, a reserved slot
84 
85 #define HWY_WASM2 0x80000 // Experimental
86 #define HWY_WASM 0x100000
87 
88 #define HWY_HIGHEST_TARGET_BIT_WASM 20 // 1 << 20 == 0x100000 == HWY_WASM
89 
90 // 0x200000, 0x400000, 0x800000 reserved
91 
92 #define HWY_RVV 0x1000000
93 
94 #define HWY_HIGHEST_TARGET_BIT_RVV 24 // 1 << 24 == 0x1000000 == HWY_RVV
95 
96 // 0x2000000, 0x4000000, 0x8000000, 0x10000000 reserved
97 
98 #define HWY_SCALAR 0x20000000
99 
100 #define HWY_HIGHEST_TARGET_BIT_SCALAR 29 // 1 << 29 == 0x20000000 == HWY_SCALAR
101 
102 // Cannot use higher values, otherwise HWY_TARGETS computation might overflow
103 // (NOTE(review): presumably the 2 * HWY_STATIC_TARGET in the default policy - confirm).
103 
104 //------------------------------------------------------------------------------
105 // Set default blocklists
106 
107 // Disabled means excluded from the enabled set at the user's request. This is
108 // a separate macro from HWY_BROKEN_TARGETS so that targets can be disabled without deactivating the blocklist below.
109 #ifndef HWY_DISABLED_TARGETS
110 #define HWY_DISABLED_TARGETS 0 // default: the user has disabled nothing
111 #endif
112 
113 // Broken means excluded from enabled due to known compiler issues. The user may
114 // override this blocklist, without any guarantee of success. The branches below form one #if/#elif chain, so only the first matching condition applies.
115 #ifndef HWY_BROKEN_TARGETS
116 
117 // x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid
118 // SSE4 codegen (possibly only for msan), so disable all those targets.
119 #if HWY_ARCH_X86 && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
120 #define HWY_BROKEN_TARGETS (HWY_SSE4 | HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL)
121 // This entails a major speed reduction, so warn unless the user explicitly
122 // opts in to scalar-only.
123 #if !defined(HWY_COMPILE_ONLY_SCALAR)
124 #pragma message("x86 Clang <= 6: define HWY_COMPILE_ONLY_SCALAR or upgrade.")
125 #endif
126 
127 // 32-bit x86 may fail to compile AVX2/3.
128 #elif HWY_ARCH_X86_32
129 #define HWY_BROKEN_TARGETS (HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL)
130 
131 // MSVC AVX3 support is buggy: https://github.com/Mysticial/Flops/issues/16
132 #elif HWY_COMPILER_MSVC != 0
133 #define HWY_BROKEN_TARGETS (HWY_AVX3 | HWY_AVX3_DL)
134 
135 // armv7be (big-endian ARMv7) has not been tested and is not yet supported.
136 #elif HWY_ARCH_ARM_V7 && \
137  (defined(__ARM_BIG_ENDIAN) || \
138  (defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN))
139 #define HWY_BROKEN_TARGETS (HWY_NEON)
140 
141 // SVE[2] require recent clang or gcc versions: blocklist them for clang < 1100 or (non-clang) gcc < 1000, in HWY_COMPILER_* version units.
142 #elif (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) ||\
143 (!HWY_COMPILER_CLANG && HWY_COMPILER_GCC && HWY_COMPILER_GCC < 1000)
144 #define HWY_BROKEN_TARGETS (HWY_SVE | HWY_SVE2)
145 
146 #else // none of the known-problematic compiler/arch combinations matched
147 #define HWY_BROKEN_TARGETS 0
148 #endif
149 
150 #endif // HWY_BROKEN_TARGETS
151 
152 // Enabled means neither disabled nor blocklisted: masks `targets` with the complement of (HWY_DISABLED_TARGETS | HWY_BROKEN_TARGETS).
153 #define HWY_ENABLED(targets) \
154  ((targets) & ~((HWY_DISABLED_TARGETS) | (HWY_BROKEN_TARGETS)))
155 
156 //------------------------------------------------------------------------------
157 // Detect baseline targets using predefined macros
158 
159 // Baseline means the targets for which the compiler is allowed to generate
160 // instructions, implying the target CPU would have to support them. Do not use
161 // this directly because it does not take the blocklist into account (see
162 // HWY_ENABLED_BASELINE). The user may override it, without any guarantee of success.
163 #ifndef HWY_BASELINE_TARGETS
164 
165 // Each HWY_BASELINE_* macro below expands to its target's bit or to 0. HWY_ARCH_*
166 // is also checked to ensure that simulating unknown platforms ends up with HWY_TARGET == HWY_SCALAR.
167 
168 #if HWY_ARCH_WASM && defined(__wasm_simd128__)
169 #if defined(HWY_WANT_WASM2)
170 #define HWY_BASELINE_WASM HWY_WASM2 // explicit opt-in via HWY_WANT_WASM2
171 #else
172 #define HWY_BASELINE_WASM HWY_WASM
173 #endif // HWY_WANT_WASM2
174 #else
175 #define HWY_BASELINE_WASM 0
176 #endif
177 
178 // Avoid choosing the PPC target until we have an implementation: the trailing `&& 0` makes this condition always false.
179 #if HWY_ARCH_PPC && defined(__VSX__) && 0
180 #define HWY_BASELINE_PPC8 HWY_PPC8
181 #else
182 #define HWY_BASELINE_PPC8 0
183 #endif
184 
185 // SVE2 compiles, but is not yet tested.
186 #if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE2)
187 #define HWY_BASELINE_SVE2 HWY_SVE2
188 #else
189 #define HWY_BASELINE_SVE2 0
190 #endif
191 
192 #if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE)
193 #define HWY_BASELINE_SVE HWY_SVE
194 #else
195 #define HWY_BASELINE_SVE 0
196 #endif
197 
198 // GCC 4.5.4 only defines __ARM_NEON__; 5.4 defines both, so accept either spelling.
199 #if HWY_ARCH_ARM && (defined(__ARM_NEON__) || defined(__ARM_NEON))
200 #define HWY_BASELINE_NEON HWY_NEON
201 #else
202 #define HWY_BASELINE_NEON 0
203 #endif
204 
205 // Special handling for MSVC (but not clang in MSVC mode) because it has fewer predefined macros
206 #if HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG
207 
208 // We can only be sure SSSE3/SSE4 are enabled if AVX is
209 // (https://stackoverflow.com/questions/18563978/)
210 #if defined(__AVX__)
211 #define HWY_CHECK_SSSE3 1
212 #define HWY_CHECK_SSE4 1
213 #else
214 #define HWY_CHECK_SSSE3 0
215 #define HWY_CHECK_SSE4 0
216 #endif
217 
218 // Cannot check for PCLMUL/AES and BMI2/FMA/F16C individually; we assume
219 // PCLMUL/AES are available if SSE4 is, and BMI2/FMA/F16C if AVX2 is.
220 #define HWY_CHECK_PCLMUL_AES 1
221 #define HWY_CHECK_BMI2_FMA 1
222 #define HWY_CHECK_F16C 1
223 
224 #else // non-MSVC
225 
226 #if defined(__SSSE3__)
227 #define HWY_CHECK_SSSE3 1
228 #else
229 #define HWY_CHECK_SSSE3 0
230 #endif
231 
232 #if defined(__SSE4_1__) && defined(__SSE4_2__)
233 #define HWY_CHECK_SSE4 1
234 #else
235 #define HWY_CHECK_SSE4 0
236 #endif
237 
238 // If these are disabled via HWY_DISABLE_*, they should not gate the availability of SSE4/AVX2, hence the check evaluates to 1.
239 #if defined(HWY_DISABLE_PCLMUL_AES) || (defined(__PCLMUL__) && defined(__AES__))
240 #define HWY_CHECK_PCLMUL_AES 1
241 #else
242 #define HWY_CHECK_PCLMUL_AES 0
243 #endif
244 
245 #if defined(HWY_DISABLE_BMI2_FMA) || (defined(__BMI2__) && defined(__FMA__))
246 #define HWY_CHECK_BMI2_FMA 1
247 #else
248 #define HWY_CHECK_BMI2_FMA 0
249 #endif
250 
251 #if defined(HWY_DISABLE_F16C) || defined(__F16C__)
252 #define HWY_CHECK_F16C 1
253 #else
254 #define HWY_CHECK_F16C 0
255 #endif
256 
257 #endif // non-MSVC
258 
259 #if HWY_ARCH_X86 && HWY_CHECK_SSSE3
260 #define HWY_BASELINE_SSSE3 HWY_SSSE3
261 #else
262 #define HWY_BASELINE_SSSE3 0
263 #endif
264 
265 #if HWY_ARCH_X86 && HWY_CHECK_SSE4 && HWY_CHECK_PCLMUL_AES
266 #define HWY_BASELINE_SSE4 HWY_SSE4 // needs SSE4.1/4.2 plus the PCLMUL/AES check
267 #else
268 #define HWY_BASELINE_SSE4 0
269 #endif
270 
271 #if HWY_BASELINE_SSE4 != 0 && HWY_CHECK_BMI2_FMA && HWY_CHECK_F16C && \
272  defined(__AVX2__)
273 #define HWY_BASELINE_AVX2 HWY_AVX2 // needs SSE4 baseline, the BMI2/FMA and F16C checks, and __AVX2__
274 #else
275 #define HWY_BASELINE_AVX2 0
276 #endif
277 
278 // Require everything in AVX2 plus AVX-512 flags F, BW, DQ and VL (also set by MSVC)
279 #if HWY_BASELINE_AVX2 != 0 && defined(__AVX512F__) && defined(__AVX512BW__) && \
280  defined(__AVX512DQ__) && defined(__AVX512VL__)
281 #define HWY_BASELINE_AVX3 HWY_AVX3
282 #else
283 #define HWY_BASELINE_AVX3 0
284 #endif
285 
286 // TODO(janwas): not yet known whether these will be set by MSVC
287 #if HWY_BASELINE_AVX3 != 0 && defined(__AVXVNNI__) && defined(__VAES__) && \
288  defined(__VPCLMULQDQ__)
289 #define HWY_BASELINE_AVX3_DL HWY_AVX3_DL
290 #else
291 #define HWY_BASELINE_AVX3_DL 0
292 #endif
293 
294 #if HWY_ARCH_RVV && defined(__riscv_vector)
295 #define HWY_BASELINE_RVV HWY_RVV
296 #else
297 #define HWY_BASELINE_RVV 0
298 #endif
299 
300 #define HWY_BASELINE_TARGETS \
301  (HWY_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | HWY_BASELINE_SVE2 | \
302  HWY_BASELINE_SVE | HWY_BASELINE_NEON | HWY_BASELINE_SSSE3 | \
303  HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 | \
304  HWY_BASELINE_AVX3_DL | HWY_BASELINE_RVV) // HWY_SCALAR is always part of the baseline
305 
306 #else
307 // User already defined HWY_BASELINE_TARGETS, but we still need to define
308 // HWY_BASELINE_AVX3_DL (matching user's definition) for HWY_CHECK_AVX3_DL.
309 #define HWY_BASELINE_AVX3_DL (HWY_BASELINE_TARGETS & HWY_AVX3_DL)
310 #endif // HWY_BASELINE_TARGETS
311 
312 //------------------------------------------------------------------------------
313 // Choose target for static dispatch
314 
315 #define HWY_ENABLED_BASELINE HWY_ENABLED(HWY_BASELINE_TARGETS) // baseline minus disabled/broken targets
316 #if HWY_ENABLED_BASELINE == 0
317 #error "At least one baseline target must be defined and enabled"
318 #endif
319 
320 // Best baseline, used for static dispatch. `x & -x` isolates the least-significant
321 // 1-bit within HWY_ENABLED_BASELINE, and lower bit values imply "better".
322 #define HWY_STATIC_TARGET (HWY_ENABLED_BASELINE & -HWY_ENABLED_BASELINE)
323 
324 // Start by assuming static dispatch. If we later use dynamic dispatch, this
325 // will be defined to other targets during the multiple-inclusion, and finally
326 // return to the initial value. Defining this outside begin/end_target ensures
327 // inl headers successfully compile by themselves (required by Bazel).
328 #define HWY_TARGET HWY_STATIC_TARGET
329 
330 //------------------------------------------------------------------------------
331 // Choose targets for dynamic dispatch according to one of four policies
332 
333 #if (defined(HWY_COMPILE_ONLY_SCALAR) + defined(HWY_COMPILE_ONLY_STATIC) + \
334  defined(HWY_COMPILE_ALL_ATTAINABLE)) > 1
335 #error "Invalid config: can only define a single policy for targets"
336 #endif // each defined() yields 0 or 1, so a sum > 1 means conflicting policy macros
337 
338 // Further to checking for disabled/broken targets, we only use AVX3_DL after
339 // explicit opt-in (via HWY_WANT_AVX3_DL OR baseline compiler flags) to avoid
340 // generating a codepath which is only helpful if the app uses AVX3_DL features.
341 #if defined(HWY_WANT_AVX3_DL)
342 #define HWY_CHECK_AVX3_DL HWY_AVX3_DL
343 #else // no opt-in: HWY_AVX3_DL only if it is already part of the baseline, else 0
344 #define HWY_CHECK_AVX3_DL HWY_BASELINE_AVX3_DL
345 #endif
346 
347 // Attainable means enabled and the compiler allows intrinsics (even when not
348 // allowed to autovectorize). Used by policies 3) and 4) below.
349 #if HWY_ARCH_X86
350 #define HWY_ATTAINABLE_TARGETS \
351  HWY_ENABLED(HWY_SCALAR | HWY_SSSE3 | HWY_SSE4 | HWY_AVX2 | HWY_AVX3 | \
352  HWY_CHECK_AVX3_DL)
353 #else // non-x86: only the enabled baseline targets are attainable
354 #define HWY_ATTAINABLE_TARGETS HWY_ENABLED_BASELINE
355 #endif
356 
357 // 1) For older compilers: disable all SIMD (could also set HWY_DISABLED_TARGETS
358 // to ~HWY_SCALAR, but this is more explicit).
359 #if defined(HWY_COMPILE_ONLY_SCALAR)
360 #undef HWY_STATIC_TARGET
361 #define HWY_STATIC_TARGET HWY_SCALAR // override baseline
362 #define HWY_TARGETS HWY_SCALAR
363 
364 // 2) For forcing static dispatch without code changes (removing HWY_EXPORT)
365 #elif defined(HWY_COMPILE_ONLY_STATIC)
366 #define HWY_TARGETS HWY_STATIC_TARGET
367 
368 // 3) For tests: include all attainable targets (in particular: scalar)
369 #elif defined(HWY_COMPILE_ALL_ATTAINABLE) || defined(HWY_IS_TEST)
370 #define HWY_TARGETS HWY_ATTAINABLE_TARGETS
371 
372 // 4) Default: attainable WITHOUT non-best baseline. (2 * HWY_STATIC_TARGET - 1) keeps the static target's bit and all lower
373 // ("better") bits, which reduces code size by excluding superseded targets, in particular scalar.
374 #else
375 #define HWY_TARGETS (HWY_ATTAINABLE_TARGETS & (2 * HWY_STATIC_TARGET - 1))
376 
377 #endif // target policy
378 
379 // Compile-time guard: HWY_ONCE and the multiple-inclusion mechanism rely on HWY_STATIC_TARGET being
380 // one of the dynamic targets. This also implies HWY_TARGETS != 0 and
381 // (HWY_TARGETS & HWY_ENABLED_BASELINE) != 0.
382 #if (HWY_TARGETS & HWY_STATIC_TARGET) == 0
383 #error "Logic error: best baseline should be included in dynamic targets"
384 #endif
385 
386 #endif // HIGHWAY_HWY_DETECT_TARGETS_H_