Vector Optimized Library of Kernels  2.5.1
Architecture-tuned implementations of math kernels
volk_32f_log2_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#ifndef INCLUDED_volk_32f_log2_32f_a_H
#define INCLUDED_volk_32f_log2_32f_a_H

#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

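/* LOG_POLY_DEGREE selects which of the minimax fits below approximates log2()
 * of the mantissa on [1, 2); the default of 6 picks the six-coefficient
 * (degree-5) polynomial. */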
#define LOG_POLY_DEGREE 6

#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++)
        *bPtr++ = log2f_non_ieee(*aPtr++);
}
#endif /* LV_HAVE_GENERIC */

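/*
 * Illustrative scalar sketch (not part of the VOLK API): every SIMD kernel in
 * this file follows the same decomposition. A positive float x is stored as
 * 2^exponent * (1 + fraction/2^23), so log2(x) = exponent + log2(mantissa)
 * with the mantissa in [1, 2); the kernels below approximate that last log2
 * with a small polynomial instead of calling log2f().
 */
static inline float log2f_decomposition_sketch(float x)
{
    union {
        float f;
        uint32_t i;
    } u;
    u.f = x;
    /* unbiased exponent: bits 30..23 minus the IEEE-754 bias of 127 */
    int exponent = (int)((u.i >> 23) & 0xffu) - 127;
    /* rescale the mantissa into [1, 2): keep the 23 fraction bits and force
     * the exponent field to the biased value of zero (127) */
    u.i = (u.i & 0x007fffffu) | 0x3f800000u;
    return (float)exponent + log2f(u.f);
}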

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

#define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_FMAAVX2(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_FMAAVX2(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

static inline void volk_32f_log2_32f_a_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;

    for (; number < eighthPoints; number++) {

        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp);

        // Now to extract mantissa
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_FMAAVX2(frac,
                                 3.1157899f,
                                 -3.3241990f,
                                 2.5988452f,
                                 -1.2315303f,
                                 3.1821337e-1f,
                                 -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_FMAAVX2(frac,
                                 2.8882704548164776201f,
                                 -2.52074962577807006663f,
                                 1.48116647521213171641f,
                                 -0.465725644288844778798f,
                                 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_FMAAVX2(frac,
                                 2.61761038894603480148f,
                                 -1.75647175389045657003f,
                                 0.688243882994381274313f,
                                 -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_FMAAVX2(frac,
                                 2.28330284476918490682f,
                                 -1.04913055217340124191f,
                                 0.204446009836232697516f);
#else
#error
#endif

        bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
        _mm256_store_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))

static inline void
volk_32f_log2_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;

    for (; number < eighthPoints; number++) {

        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp);

        // Now to extract mantissa
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error
#endif

        bVal =
            _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
        _mm256_store_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_AVX2 for aligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

static inline void
volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, mantissa, frac, leadingOne;
    __m128i bias, exp;

    for (; number < quarterPoints; number++) {

        aVal = _mm_load_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        bVal = _mm_cvtepi32_ps(exp);

        // Now to extract mantissa
        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error
#endif

        bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        _mm_store_ps(bPtr, bVal);

        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_SSE4_1 for aligned */

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/* these macros allow us to embed logs in other kernels */
#define VLOG2Q_NEON_PREAMBLE() \
    int32x4_t one = vdupq_n_s32(0x00800000); \
    /* minimax polynomial */ \
    float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \
    float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \
    float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \
    float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \
    float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \
    float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \
    float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \
    int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \
    int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \
    int32x4_t exp_bias = vdupq_n_s32(127);


#define VLOG2Q_NEON_F32(log2_approx, aval) \
    int32x4_t exponent_i = vandq_s32(aval, exp_mask); \
    int32x4_t significand_i = vandq_s32(aval, sig_mask); \
    exponent_i = vshrq_n_s32(exponent_i, 23); \
    \
    /* extract the exponent and significand \
       we can treat this as fixed point to save ~9% on the \
       conversion + float add */ \
    significand_i = vorrq_s32(one, significand_i); \
    float32x4_t significand_f = vcvtq_n_f32_s32(significand_i, 23); \
    /* debias the exponent and convert to float */ \
    exponent_i = vsubq_s32(exponent_i, exp_bias); \
    float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \
    \
    /* put the significand through a polynomial fit of log2(x) [1,2] \
       add the result to the exponent */ \
    log2_approx = vaddq_f32(exponent_f, p0); /* p0 */ \
    float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */ \
    log2_approx = vaddq_f32(log2_approx, tmp1); \
    float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */ \
    tmp1 = vmulq_f32(sig_2, p2); /* p2 * x^2 */ \
    log2_approx = vaddq_f32(log2_approx, tmp1); \
    \
    float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */ \
    tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */ \
    log2_approx = vaddq_f32(log2_approx, tmp1); \
    float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */ \
    tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */ \
    log2_approx = vaddq_f32(log2_approx, tmp1); \
    float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */ \
    tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */ \
    log2_approx = vaddq_f32(log2_approx, tmp1); \
    float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */ \
    tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */ \
    log2_approx = vaddq_f32(log2_approx, tmp1);

static inline void
volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number;
    const unsigned int quarterPoints = num_points / 4;

    int32x4_t aval;
    float32x4_t log2_approx;

    VLOG2Q_NEON_PREAMBLE()
    // lms
    // p0 = vdupq_n_f32(-1.649132280361871);
    // p1 = vdupq_n_f32(1.995047138579499);
    // p2 = vdupq_n_f32(-0.336914839219728);

    // keep in mind a single precision float is represented as
    // (-1)^sign * 2^exp * 1.significand, so the log2 is
    // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23))
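    // e.g. x = 12.0f = 2^3 * 1.5: the unbiased exponent is 3, the significand
    // term is 1.5, and log2(12.0) = 3 + log2(1.5) ~= 3 + 0.585 = 3.585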
    for (number = 0; number < quarterPoints; ++number) {
        // load float into an int register without conversion
        aval = vld1q_s32((int*)aPtr);

        VLOG2Q_NEON_F32(log2_approx, aval)

        vst1q_f32(bPtr, log2_approx);

        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_NEON */


#endif /* INCLUDED_volk_32f_log2_32f_a_H */

#ifndef INCLUDED_volk_32f_log2_32f_u_H
#define INCLUDED_volk_32f_log2_32f_u_H


#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_log2_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        float const result = log2f(*aPtr++);
        *bPtr++ = isinf(result) ? -127.0f : result;
    }
}

#endif /* LV_HAVE_GENERIC */


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

static inline void
volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, mantissa, frac, leadingOne;
    __m128i bias, exp;

    for (; number < quarterPoints; number++) {

        aVal = _mm_loadu_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        bVal = _mm_cvtepi32_ps(exp);

        // Now to extract mantissa
        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error
#endif

        bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        _mm_storeu_ps(bPtr, bVal);

        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_SSE4_1 for unaligned */

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

#define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_FMAAVX2(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_FMAAVX2(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

static inline void volk_32f_log2_32f_u_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;

    for (; number < eighthPoints; number++) {

        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp);

        // Now to extract mantissa
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_FMAAVX2(frac,
                                 3.1157899f,
                                 -3.3241990f,
                                 2.5988452f,
                                 -1.2315303f,
                                 3.1821337e-1f,
                                 -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_FMAAVX2(frac,
                                 2.8882704548164776201f,
                                 -2.52074962577807006663f,
                                 1.48116647521213171641f,
                                 -0.465725644288844778798f,
                                 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_FMAAVX2(frac,
                                 2.61761038894603480148f,
                                 -1.75647175389045657003f,
                                 0.688243882994381274313f,
                                 -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_FMAAVX2(frac,
                                 2.28330284476918490682f,
                                 -1.04913055217340124191f,
                                 0.204446009836232697516f);
#else
#error
#endif

        bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
        _mm256_storeu_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))

static inline void
volk_32f_log2_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;

    for (; number < eighthPoints; number++) {

        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp);

        // Now to extract mantissa
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error
#endif

        bVal =
            _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
        _mm256_storeu_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_AVX2 for unaligned */


#endif /* INCLUDED_volk_32f_log2_32f_u_H */
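
/*
 * Usage sketch (illustrative only; assumes the application includes
 * <volk/volk.h> and links against VOLK, which provide the
 * volk_32f_log2_32f() dispatcher and the volk_malloc()/volk_free() helpers
 * used below):
 *
 *   unsigned int N = 1024;
 *   float* in = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
 *   float* out = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
 *
 *   for (unsigned int ii = 0; ii < N; ++ii)
 *       in[ii] = (float)(ii + 1); // log2 is only defined for positive inputs
 *
 *   // the dispatcher picks the fastest kernel available on this machine
 *   volk_32f_log2_32f(out, in, N);
 *
 *   volk_free(in);
 *   volk_free(out);
 */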