Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_avx2_intrinsics.h
/* -*- c++ -*- */
/*
 * Copyright 2015 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

/*
 * This file is intended to hold composite AVX2 intrinsics, i.e. small
 * helpers built from raw intrinsics. They should be used in VOLK kernels
 * to avoid copy-paste.
 */

#ifndef INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
#define INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
#include "volk_avx_intrinsics.h" // for _mm256_polar_deinterleave
#include <immintrin.h>
static inline __m256 _mm256_polar_sign_mask_avx2(__m128i fbits)
{
    const __m128i zeros = _mm_set1_epi8(0x00);
    const __m128i sign_extract = _mm_set1_epi8(0x80);
    /* Routes source byte i into the high byte of 32-bit lane i; 0xff
     * entries zero the corresponding destination byte. fbits is duplicated
     * into both 128-bit lanes below because vpshufb shuffles within lanes. */
    const __m256i shuffle_mask = _mm256_setr_epi8(0xff, 0xff, 0xff, 0x00,
                                                  0xff, 0xff, 0xff, 0x01,
                                                  0xff, 0xff, 0xff, 0x02,
                                                  0xff, 0xff, 0xff, 0x03,
                                                  0xff, 0xff, 0xff, 0x04,
                                                  0xff, 0xff, 0xff, 0x05,
                                                  0xff, 0xff, 0xff, 0x06,
                                                  0xff, 0xff, 0xff, 0x07);
    __m256i sign_bits = _mm256_setzero_si256();

    fbits = _mm_cmpgt_epi8(fbits, zeros);
    fbits = _mm_and_si128(fbits, sign_extract);
    sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 0);
    sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 1);
    sign_bits = _mm256_shuffle_epi8(sign_bits, shuffle_mask);

    return _mm256_castsi256_ps(sign_bits);
}

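/*
 * Scalar reference for the helper above (illustrative sketch, not part of
 * VOLK): byte i of fbits, for i = 0..7 (only the low eight bytes are
 * consulted), decides whether float lane i of the returned mask has its
 * sign bit set. XOR-ing the mask with a float vector therefore flips the
 * signs of exactly the selected lanes.
 *
 * uint32_t mask[8];
 * const signed char* bytes = (const signed char*)&fbits;
 * for (int i = 0; i < 8; ++i) {
 *     mask[i] = (bytes[i] > 0) ? 0x80000000u : 0x00000000u;
 * }
 */
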
static inline __m256
_mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits)
{
    // prepare sign mask for correct +-
    __m256 sign_mask = _mm256_polar_sign_mask_avx2(fbits);

    __m256 llr0, llr1;
    _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);

    // calculate result
    llr0 = _mm256_xor_ps(llr0, sign_mask);
    __m256 dst = _mm256_add_ps(llr0, llr1);
    return dst;
}

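/*
 * Usage sketch (illustrative only; buffer names are hypothetical). The
 * helper consumes 16 consecutive floats holding eight interleaved
 * (llr0, llr1) pairs plus eight flag bytes, and produces the eight values
 * llr1 + (flag ? -llr0 : +llr0), matching the polar-code g-function step.
 *
 * const float* llr_in = ...;        // 2 * n floats, interleaved LLR pairs
 * const unsigned char* flags = ...; // n bytes, 0 or 1
 * float* llr_out = ...;             // n floats, n a multiple of 8
 * for (unsigned i = 0; i < n; i += 8) {
 *     __m256 src0 = _mm256_loadu_ps(llr_in + 2 * i);
 *     __m256 src1 = _mm256_loadu_ps(llr_in + 2 * i + 8);
 *     __m128i fbits = _mm_loadl_epi64((const __m128i*)(flags + i));
 *     __m256 dst = _mm256_polar_fsign_add_llrs_avx2(src0, src1, fbits);
 *     _mm256_storeu_ps(llr_out + i, dst);
 * }
 */
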
static inline __m256 _mm256_magnitudesquared_ps_avx2(const __m256 cplxValue0,
                                                     const __m256 cplxValue1)
{
    const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    const __m256 squared0 = _mm256_mul_ps(cplxValue0, cplxValue0); // square the values
    const __m256 squared1 = _mm256_mul_ps(cplxValue1, cplxValue1); // square the values
    const __m256 complex_result = _mm256_hadd_ps(squared0, squared1);
    // hadd interleaves the 128-bit lanes; the permute restores input order
    return _mm256_permutevar8x32_ps(complex_result, idx);
}

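/*
 * Usage sketch (illustrative only; pointer names are hypothetical):
 * compute |z|^2 for eight complex floats stored interleaved as
 * (re, im, re, im, ...).
 *
 * const float* in = (const float*)complex_in; // 8 complex = 16 floats
 * __m256 c0 = _mm256_loadu_ps(in);            // complex values 0..3
 * __m256 c1 = _mm256_loadu_ps(in + 8);        // complex values 4..7
 * _mm256_storeu_ps(mag_sq_out, _mm256_magnitudesquared_ps_avx2(c0, c1));
 * // mag_sq_out now holds |z_0|^2 ... |z_7|^2 in input order
 */
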
static inline __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0,
                                                     const __m256 symbols1,
                                                     const __m256 points0,
                                                     const __m256 points1,
                                                     const __m256 scalar)
{
    /*
     * Calculate: |y - x|^2 * SNR_lin
     * Consider 'symbolsX' and 'pointsX' to be complex float
     * 'symbolsX' are 'y' and 'pointsX' are 'x'
     */
    const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
    const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
    const __m256 norms = _mm256_magnitudesquared_ps_avx2(diff0, diff1);
    return _mm256_mul_ps(norms, scalar);
}

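/*
 * Usage sketch (illustrative only; names are hypothetical): eight scaled
 * squared distances between received symbols y and constellation points x.
 *
 * const __m256 scalar = _mm256_set1_ps(snr_lin);
 * __m256 y0 = _mm256_loadu_ps((const float*)symbols);     // symbols 0..3
 * __m256 y1 = _mm256_loadu_ps((const float*)symbols + 8); // symbols 4..7
 * __m256 x0 = _mm256_loadu_ps((const float*)points);
 * __m256 x1 = _mm256_loadu_ps((const float*)points + 8);
 * __m256 dist = _mm256_scaled_norm_dist_ps_avx2(y0, y1, x0, x1, scalar);
 * _mm256_storeu_ps(out, dist);
 */
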
/*
 * The function below vectorizes the inner loop of the following code:
 *
 * float max_values[8] = {0.f};
 * unsigned max_indices[8] = {0};
 * unsigned current_indices[8] = {0, 1, 2, 3, 4, 5, 6, 7};
 * for (unsigned i = 0; i < num_points / 8; ++i) {
 *     for (unsigned j = 0; j < 8; ++j) {
 *         float abs_squared = real(src0) * real(src0) + imag(src0) * imag(src0);
 *         bool compare = abs_squared > max_values[j];
 *         max_values[j] = compare ? abs_squared : max_values[j];
 *         max_indices[j] = compare ? current_indices[j] : max_indices[j];
 *         current_indices[j] += 8; // update for next outer loop iteration
 *         ++src0;
 *     }
 * }
 */
static inline void vector_32fc_index_max_variant0(__m256 in0,
                                                  __m256 in1,
                                                  __m256* max_values,
                                                  __m256i* max_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);

    /*
     * Given the vectors a = (a_7, a_6, …, a_1, a_0) and b = (b_7, b_6, …, b_1, b_0),
     * hadd_ps(a, b) computes
     * (b_7 + b_6,
     *  b_5 + b_4,
     *  ---------
     *  a_7 + a_6,
     *  a_5 + a_4,
     *  ---------
     *  b_3 + b_2,
     *  b_1 + b_0,
     *  ---------
     *  a_3 + a_2,
     *  a_1 + a_0).
     * The result is the squared absolute value of complex numbers at index
     * offsets (7, 6, 3, 2, 5, 4, 1, 0). This must be the initial value of
     * current_indices!
     */
    __m256 abs_squared = _mm256_hadd_ps(in0, in1);

    /*
     * Compare the recently computed squared absolute values with the
     * previously determined maximum values. cmp_ps(a, b) computes
     * a > b ? 0xFFFFFFFF : 0 for each element in the vectors =>
     * compare_mask = abs_squared > max_values ? 0xFFFFFFFF : 0
     *
     * If either operand is NaN, 0 is returned as an “ordered” comparison is
     * used => the blend operation will select the value from *max_values.
     */
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);

    /* Select maximum by blending. This is the only line which differs from variant1 */
    *max_values = _mm256_blendv_ps(*max_values, abs_squared, compare_mask);

    /*
     * Update indices: blendv_ps(a, b, mask) computes mask ? b : a for
     * each element in the vectors =>
     * max_indices = compare_mask ? current_indices : max_indices
     *
     * Note: The casting of data types is required to make the compiler happy
     * and does not change values.
     */
    *max_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));

    /* compute indices of complex numbers which will be loaded in the next iteration */
    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}

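/*
 * Driver sketch for the helper above (illustrative only; names are
 * hypothetical). Note the initial lane order (7, 6, 3, 2, 5, 4, 1, 0) of
 * current_indices, matching the hadd output described above. src points to
 * num_points complex floats, num_points being a multiple of 8.
 *
 * const float* in = (const float*)src;
 * __m256 max_values = _mm256_setzero_ps();
 * __m256i max_indices = _mm256_setzero_si256();
 * __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
 * const __m256i indices_increment = _mm256_set1_epi32(8);
 * for (unsigned i = 0; i < num_points; i += 8) {
 *     __m256 in0 = _mm256_loadu_ps(in);     // complex values i .. i+3
 *     __m256 in1 = _mm256_loadu_ps(in + 8); // complex values i+4 .. i+7
 *     vector_32fc_index_max_variant0(in0, in1, &max_values, &max_indices,
 *                                    &current_indices, indices_increment);
 *     in += 16;
 * }
 */
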
/* See _variant0 for details */
static inline void vector_32fc_index_max_variant1(__m256 in0,
                                                  __m256 in1,
                                                  __m256* max_values,
                                                  __m256i* max_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);

    __m256 abs_squared = _mm256_hadd_ps(in0, in1);
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);

    /*
     * This is the only line which differs from variant0. Using maxps instead
     * of blendvps is faster on Intel CPUs (at least on the ones tested).
     *
     * Note: The order of arguments matters if a NaN is encountered, in which
     * case the value of the second argument is selected. This is consistent
     * with the “ordered” comparison and the blend operation: the comparison
     * returns false if a NaN is encountered and the blend operation
     * consequently selects the value from max_indices.
     */
    *max_values = _mm256_max_ps(abs_squared, *max_values);

    *max_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));

    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}

/*
 * The function below vectorizes the inner loop of the following code:
 *
 * float min_values[8] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX,
 *                        FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX};
 * unsigned min_indices[8] = {0};
 * unsigned current_indices[8] = {0, 1, 2, 3, 4, 5, 6, 7};
 * for (unsigned i = 0; i < num_points / 8; ++i) {
 *     for (unsigned j = 0; j < 8; ++j) {
 *         float abs_squared = real(src0) * real(src0) + imag(src0) * imag(src0);
 *         bool compare = abs_squared < min_values[j];
 *         min_values[j] = compare ? abs_squared : min_values[j];
 *         min_indices[j] = compare ? current_indices[j] : min_indices[j];
 *         current_indices[j] += 8; // update for next outer loop iteration
 *         ++src0;
 *     }
 * }
 */
static inline void vector_32fc_index_min_variant0(__m256 in0,
                                                  __m256 in1,
                                                  __m256* min_values,
                                                  __m256i* min_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);

    /*
     * Given the vectors a = (a_7, a_6, …, a_1, a_0) and b = (b_7, b_6, …, b_1, b_0),
     * hadd_ps(a, b) computes
     * (b_7 + b_6,
     *  b_5 + b_4,
     *  ---------
     *  a_7 + a_6,
     *  a_5 + a_4,
     *  ---------
     *  b_3 + b_2,
     *  b_1 + b_0,
     *  ---------
     *  a_3 + a_2,
     *  a_1 + a_0).
     * The result is the squared absolute value of complex numbers at index
     * offsets (7, 6, 3, 2, 5, 4, 1, 0). This must be the initial value of
     * current_indices!
     */
    __m256 abs_squared = _mm256_hadd_ps(in0, in1);

    /*
     * Compare the recently computed squared absolute values with the
     * previously determined minimum values. cmp_ps(a, b) computes
     * a < b ? 0xFFFFFFFF : 0 for each element in the vectors =>
     * compare_mask = abs_squared < min_values ? 0xFFFFFFFF : 0
     *
     * If either operand is NaN, 0 is returned as an “ordered” comparison is
     * used => the blend operation will select the value from *min_values.
     */
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);

    /* Select minimum by blending. This is the only line which differs from variant1 */
    *min_values = _mm256_blendv_ps(*min_values, abs_squared, compare_mask);

    /*
     * Update indices: blendv_ps(a, b, mask) computes mask ? b : a for
     * each element in the vectors =>
     * min_indices = compare_mask ? current_indices : min_indices
     *
     * Note: The casting of data types is required to make the compiler happy
     * and does not change values.
     */
    *min_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));

    /* compute indices of complex numbers which will be loaded in the next iteration */
    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}

/* See _variant0 for details */
static inline void vector_32fc_index_min_variant1(__m256 in0,
                                                  __m256 in1,
                                                  __m256* min_values,
                                                  __m256i* min_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);

    __m256 abs_squared = _mm256_hadd_ps(in0, in1);
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);

    /*
     * This is the only line which differs from variant0. Using minps instead
     * of blendvps is faster on Intel CPUs (at least on the ones tested).
     *
     * Note: The order of arguments matters if a NaN is encountered, in which
     * case the value of the second argument is selected. This is consistent
     * with the “ordered” comparison and the blend operation: the comparison
     * returns false if a NaN is encountered and the blend operation
     * consequently selects the value from min_indices.
     */
    *min_values = _mm256_min_ps(abs_squared, *min_values);

    *min_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));

    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}
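
/*
 * Reduction sketch (illustrative only; names are hypothetical): after the
 * main loop, the eight lanes hold per-lane winners. Spill them to memory
 * and scan once to obtain the scalar result; the max variants reduce the
 * same way with '>'.
 *
 * float values[8];
 * unsigned indices[8];
 * _mm256_storeu_ps(values, min_values);
 * _mm256_storeu_si256((__m256i*)indices, min_indices);
 * float best_value = values[0];
 * unsigned best_index = indices[0];
 * for (int i = 1; i < 8; ++i) {
 *     if (values[i] < best_value) {
 *         best_value = values[i];
 *         best_index = indices[i];
 *     }
 * }
 */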

#endif /* INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ */