Vector Optimized Library of Kernels  2.5.1
Architecture-tuned implementations of math kernels
volk_32f_cos_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
72 #include <inttypes.h>
73 #include <math.h>
74 #include <stdio.h>
75 
76 #ifndef INCLUDED_volk_32f_cos_32f_a_H
77 #define INCLUDED_volk_32f_cos_32f_a_H
78 
79 #ifdef LV_HAVE_AVX512F
80 
81 #include <immintrin.h>
82 static inline void volk_32f_cos_32f_a_avx512f(float* cosVector,
83  const float* inVector,
84  unsigned int num_points)
85 {
86  float* cosPtr = cosVector;
87  const float* inPtr = inVector;
88 
89  unsigned int number = 0;
90  unsigned int sixteenPoints = num_points / 16;
91  unsigned int i = 0;
92 
93  __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
94  fones, sine, cosine;
95  __m512i q, zeros, ones, twos, fours;
96 
97  m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
98  pio4A = _mm512_set1_ps(0.7853981554508209228515625);
99  pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
100  pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
101  ffours = _mm512_set1_ps(4.0);
102  ftwos = _mm512_set1_ps(2.0);
103  fones = _mm512_set1_ps(1.0);
104  zeros = _mm512_setzero_epi32();
105  ones = _mm512_set1_epi32(1);
106  twos = _mm512_set1_epi32(2);
107  fours = _mm512_set1_epi32(4);
108 
109  cp1 = _mm512_set1_ps(1.0);
110  cp2 = _mm512_set1_ps(0.08333333333333333);
111  cp3 = _mm512_set1_ps(0.002777777777777778);
112  cp4 = _mm512_set1_ps(4.96031746031746e-05);
113  cp5 = _mm512_set1_ps(5.511463844797178e-07);
114  __mmask16 condition1, condition2;
115 
116  for (; number < sixteenPoints; number++) {
117  aVal = _mm512_load_ps(inPtr);
118  // s = fabs(aVal)
119  s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
120 
121  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
122  q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
123  // r = q + q&1, q indicates quadrant, r gives
124  r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
125 
126  s = _mm512_fnmadd_ps(r, pio4A, s);
127  s = _mm512_fnmadd_ps(r, pio4B, s);
128  s = _mm512_fnmadd_ps(r, pio4C, s);
129 
130  s = _mm512_div_ps(
131  s,
132  _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction
133  s = _mm512_mul_ps(s, s);
134  // Evaluate Taylor series
135  s = _mm512_mul_ps(
136  _mm512_fmadd_ps(
137  _mm512_fmsub_ps(
138  _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
139  s,
140  cp1),
141  s);
142 
143  for (i = 0; i < 3; i++)
144  s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
145  s = _mm512_div_ps(s, ftwos);
146 
147  sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
148  cosine = _mm512_sub_ps(fones, s);
149 
150  // if(((q+1)&2) != 0) { cosine=sine;}
151  condition1 = _mm512_cmpneq_epi32_mask(
152  _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
153 
154  // if(((q+2)&4) != 0) { cosine = -cosine;}
155  condition2 = _mm512_cmpneq_epi32_mask(
156  _mm512_and_si512(_mm512_add_epi32(q, twos), fours), zeros);
157  cosine = _mm512_mask_blend_ps(condition1, cosine, sine);
158  cosine = _mm512_mask_mul_ps(cosine, condition2, cosine, _mm512_set1_ps(-1.f));
159  _mm512_store_ps(cosPtr, cosine);
160  inPtr += 16;
161  cosPtr += 16;
162  }
163 
164  number = sixteenPoints * 16;
165  for (; number < num_points; number++) {
166  *cosPtr++ = cosf(*inPtr++);
167  }
168 }
169 #endif
170 
171 #if LV_HAVE_AVX2 && LV_HAVE_FMA
172 #include <immintrin.h>
173 
174 static inline void
175 volk_32f_cos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
176 {
177  float* bPtr = bVector;
178  const float* aPtr = aVector;
179 
180  unsigned int number = 0;
181  unsigned int eighthPoints = num_points / 8;
182  unsigned int i = 0;
183 
184  __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
185  fones, fzeroes;
186  __m256 sine, cosine;
187  __m256i q, ones, twos, fours;
188 
189  m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
190  pio4A = _mm256_set1_ps(0.7853981554508209228515625);
191  pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
192  pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
193  ffours = _mm256_set1_ps(4.0);
194  ftwos = _mm256_set1_ps(2.0);
195  fones = _mm256_set1_ps(1.0);
196  fzeroes = _mm256_setzero_ps();
197  __m256i zeroes = _mm256_set1_epi32(0);
198  ones = _mm256_set1_epi32(1);
199  __m256i allones = _mm256_set1_epi32(0xffffffff);
200  twos = _mm256_set1_epi32(2);
201  fours = _mm256_set1_epi32(4);
202 
203  cp1 = _mm256_set1_ps(1.0);
204  cp2 = _mm256_set1_ps(0.08333333333333333);
205  cp3 = _mm256_set1_ps(0.002777777777777778);
206  cp4 = _mm256_set1_ps(4.96031746031746e-05);
207  cp5 = _mm256_set1_ps(5.511463844797178e-07);
208  union bit256 condition1;
209  union bit256 condition3;
210 
211  for (; number < eighthPoints; number++) {
212 
213  aVal = _mm256_load_ps(aPtr);
214  // s = fabs(aVal)
215  s = _mm256_sub_ps(aVal,
216  _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
217  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
218  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
219  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
220  // r = q + q&1, q indicates quadrant, r gives
221  r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
222 
223  s = _mm256_fnmadd_ps(r, pio4A, s);
224  s = _mm256_fnmadd_ps(r, pio4B, s);
225  s = _mm256_fnmadd_ps(r, pio4C, s);
226 
227  s = _mm256_div_ps(
228  s,
229  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
230  s = _mm256_mul_ps(s, s);
231  // Evaluate Taylor series
232  s = _mm256_mul_ps(
233  _mm256_fmadd_ps(
234  _mm256_fmsub_ps(
235  _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
236  s,
237  cp1),
238  s);
239 
240  for (i = 0; i < 3; i++)
241  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
242  s = _mm256_div_ps(s, ftwos);
243 
244  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
245  cosine = _mm256_sub_ps(fones, s);
246 
247  // if(((q+1)&2) != 0) { cosine=sine;}
248  condition1.int_vec =
249  _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
250  condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
251 
252  // if(((q+2)&4) != 0) { cosine = -cosine;}
253  condition3.int_vec = _mm256_cmpeq_epi32(
254  _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
255  condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
256 
257  cosine = _mm256_add_ps(
258  cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
259  cosine = _mm256_sub_ps(cosine,
260  _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
261  condition3.float_vec));
262  _mm256_store_ps(bPtr, cosine);
263  aPtr += 8;
264  bPtr += 8;
265  }
266 
267  number = eighthPoints * 8;
268  for (; number < num_points; number++) {
269  *bPtr++ = cos(*aPtr++);
270  }
271 }
272 
273 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
274 
275 #ifdef LV_HAVE_AVX2
276 #include <immintrin.h>
277 
278 static inline void
279 volk_32f_cos_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
280 {
281  float* bPtr = bVector;
282  const float* aPtr = aVector;
283 
284  unsigned int number = 0;
285  unsigned int eighthPoints = num_points / 8;
286  unsigned int i = 0;
287 
288  __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
289  fones, fzeroes;
290  __m256 sine, cosine;
291  __m256i q, ones, twos, fours;
292 
293  m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
294  pio4A = _mm256_set1_ps(0.7853981554508209228515625);
295  pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
296  pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
297  ffours = _mm256_set1_ps(4.0);
298  ftwos = _mm256_set1_ps(2.0);
299  fones = _mm256_set1_ps(1.0);
300  fzeroes = _mm256_setzero_ps();
301  __m256i zeroes = _mm256_set1_epi32(0);
302  ones = _mm256_set1_epi32(1);
303  __m256i allones = _mm256_set1_epi32(0xffffffff);
304  twos = _mm256_set1_epi32(2);
305  fours = _mm256_set1_epi32(4);
306 
307  cp1 = _mm256_set1_ps(1.0);
308  cp2 = _mm256_set1_ps(0.08333333333333333);
309  cp3 = _mm256_set1_ps(0.002777777777777778);
310  cp4 = _mm256_set1_ps(4.96031746031746e-05);
311  cp5 = _mm256_set1_ps(5.511463844797178e-07);
312  union bit256 condition1;
313  union bit256 condition3;
314 
315  for (; number < eighthPoints; number++) {
316 
317  aVal = _mm256_load_ps(aPtr);
318  // s = fabs(aVal)
319  s = _mm256_sub_ps(aVal,
320  _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
321  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
322  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
323  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
324  // r = q + q&1, q indicates quadrant, r gives
325  r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
326 
327  s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));
328  s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B));
329  s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C));
330 
331  s = _mm256_div_ps(
332  s,
333  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
334  s = _mm256_mul_ps(s, s);
335  // Evaluate Taylor series
336  s = _mm256_mul_ps(
337  _mm256_add_ps(
338  _mm256_mul_ps(
339  _mm256_sub_ps(
340  _mm256_mul_ps(
341  _mm256_add_ps(
342  _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
343  s),
344  cp3),
345  s),
346  cp2),
347  s),
348  cp1),
349  s);
350 
351  for (i = 0; i < 3; i++)
352  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
353  s = _mm256_div_ps(s, ftwos);
354 
355  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
356  cosine = _mm256_sub_ps(fones, s);
357 
358  // if(((q+1)&2) != 0) { cosine=sine;}
359  condition1.int_vec =
360  _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
361  condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
362 
363  // if(((q+2)&4) != 0) { cosine = -cosine;}
364  condition3.int_vec = _mm256_cmpeq_epi32(
365  _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
366  condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
367 
368  cosine = _mm256_add_ps(
369  cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
370  cosine = _mm256_sub_ps(cosine,
371  _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
372  condition3.float_vec));
373  _mm256_store_ps(bPtr, cosine);
374  aPtr += 8;
375  bPtr += 8;
376  }
377 
378  number = eighthPoints * 8;
379  for (; number < num_points; number++) {
380  *bPtr++ = cos(*aPtr++);
381  }
382 }
383 
384 #endif /* LV_HAVE_AVX2 for aligned */
385 
386 #ifdef LV_HAVE_SSE4_1
387 #include <smmintrin.h>
388 
389 static inline void
390 volk_32f_cos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
391 {
392  float* bPtr = bVector;
393  const float* aPtr = aVector;
394 
395  unsigned int number = 0;
396  unsigned int quarterPoints = num_points / 4;
397  unsigned int i = 0;
398 
399  __m128 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
400  fones, fzeroes;
401  __m128 sine, cosine;
402  __m128i q, ones, twos, fours;
403 
404  m4pi = _mm_set1_ps(1.273239544735162542821171882678754627704620361328125);
405  pio4A = _mm_set1_ps(0.7853981554508209228515625);
406  pio4B = _mm_set1_ps(0.794662735614792836713604629039764404296875e-8);
407  pio4C = _mm_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
408  ffours = _mm_set1_ps(4.0);
409  ftwos = _mm_set1_ps(2.0);
410  fones = _mm_set1_ps(1.0);
411  fzeroes = _mm_setzero_ps();
412  __m128i zeroes = _mm_set1_epi32(0);
413  ones = _mm_set1_epi32(1);
414  __m128i allones = _mm_set1_epi32(0xffffffff);
415  twos = _mm_set1_epi32(2);
416  fours = _mm_set1_epi32(4);
417 
418  cp1 = _mm_set1_ps(1.0);
419  cp2 = _mm_set1_ps(0.08333333333333333);
420  cp3 = _mm_set1_ps(0.002777777777777778);
421  cp4 = _mm_set1_ps(4.96031746031746e-05);
422  cp5 = _mm_set1_ps(5.511463844797178e-07);
423  union bit128 condition1;
424  union bit128 condition3;
425 
426  for (; number < quarterPoints; number++) {
427 
428  aVal = _mm_load_ps(aPtr);
429  // s = fabs(aVal)
430  s = _mm_sub_ps(aVal,
431  _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
432  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
433  q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
434  // r = q + q&1, q indicates quadrant, r gives
435  r = _mm_cvtepi32_ps(_mm_add_epi32(q, _mm_and_si128(q, ones)));
436 
437  s = _mm_sub_ps(s, _mm_mul_ps(r, pio4A));
438  s = _mm_sub_ps(s, _mm_mul_ps(r, pio4B));
439  s = _mm_sub_ps(s, _mm_mul_ps(r, pio4C));
440 
441  s = _mm_div_ps(
442  s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
443  s = _mm_mul_ps(s, s);
444  // Evaluate Taylor series
445  s = _mm_mul_ps(
446  _mm_add_ps(
447  _mm_mul_ps(
448  _mm_sub_ps(
449  _mm_mul_ps(
450  _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
451  cp3),
452  s),
453  cp2),
454  s),
455  cp1),
456  s);
457 
458  for (i = 0; i < 3; i++)
459  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
460  s = _mm_div_ps(s, ftwos);
461 
462  sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
463  cosine = _mm_sub_ps(fones, s);
464 
465  // if(((q+1)&2) != 0) { cosine=sine;}
466  condition1.int_vec =
467  _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, ones), twos), zeroes);
468  condition1.int_vec = _mm_xor_si128(allones, condition1.int_vec);
469 
470  // if(((q+2)&4) != 0) { cosine = -cosine;}
471  condition3.int_vec =
472  _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, twos), fours), zeroes);
473  condition3.int_vec = _mm_xor_si128(allones, condition3.int_vec);
474 
475  cosine = _mm_add_ps(cosine,
476  _mm_and_ps(_mm_sub_ps(sine, cosine), condition1.float_vec));
477  cosine = _mm_sub_ps(
478  cosine,
479  _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3.float_vec));
480  _mm_store_ps(bPtr, cosine);
481  aPtr += 4;
482  bPtr += 4;
483  }
484 
485  number = quarterPoints * 4;
486  for (; number < num_points; number++) {
487  *bPtr++ = cosf(*aPtr++);
488  }
489 }
490 
491 #endif /* LV_HAVE_SSE4_1 for aligned */
492 
493 #endif /* INCLUDED_volk_32f_cos_32f_a_H */
494 
495 
496 #ifndef INCLUDED_volk_32f_cos_32f_u_H
497 #define INCLUDED_volk_32f_cos_32f_u_H
498 
499 #ifdef LV_HAVE_AVX512F
500 
501 #include <immintrin.h>
502 static inline void volk_32f_cos_32f_u_avx512f(float* cosVector,
503  const float* inVector,
504  unsigned int num_points)
505 {
506  float* cosPtr = cosVector;
507  const float* inPtr = inVector;
508 
509  unsigned int number = 0;
510  unsigned int sixteenPoints = num_points / 16;
511  unsigned int i = 0;
512 
513  __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
514  fones, sine, cosine;
515  __m512i q, zeros, ones, twos, fours;
516 
517  m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
518  pio4A = _mm512_set1_ps(0.7853981554508209228515625);
519  pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
520  pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
521  ffours = _mm512_set1_ps(4.0);
522  ftwos = _mm512_set1_ps(2.0);
523  fones = _mm512_set1_ps(1.0);
524  zeros = _mm512_setzero_epi32();
525  ones = _mm512_set1_epi32(1);
526  twos = _mm512_set1_epi32(2);
527  fours = _mm512_set1_epi32(4);
528 
529  cp1 = _mm512_set1_ps(1.0);
530  cp2 = _mm512_set1_ps(0.08333333333333333);
531  cp3 = _mm512_set1_ps(0.002777777777777778);
532  cp4 = _mm512_set1_ps(4.96031746031746e-05);
533  cp5 = _mm512_set1_ps(5.511463844797178e-07);
534  __mmask16 condition1, condition2;
535  for (; number < sixteenPoints; number++) {
536  aVal = _mm512_loadu_ps(inPtr);
537  // s = fabs(aVal)
538  s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
539 
540  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
541  q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
542  // r = q + q&1, q indicates quadrant, r gives
543  r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
544 
545  s = _mm512_fnmadd_ps(r, pio4A, s);
546  s = _mm512_fnmadd_ps(r, pio4B, s);
547  s = _mm512_fnmadd_ps(r, pio4C, s);
548 
549  s = _mm512_div_ps(
550  s,
551  _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction
552  s = _mm512_mul_ps(s, s);
553  // Evaluate Taylor series
554  s = _mm512_mul_ps(
555  _mm512_fmadd_ps(
556  _mm512_fmsub_ps(
557  _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
558  s,
559  cp1),
560  s);
561 
562  for (i = 0; i < 3; i++)
563  s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
564  s = _mm512_div_ps(s, ftwos);
565 
566  sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
567  cosine = _mm512_sub_ps(fones, s);
568 
569  // if(((q+1)&2) != 0) { cosine=sine;}
570  condition1 = _mm512_cmpneq_epi32_mask(
571  _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
572 
573  // if(((q+2)&4) != 0) { cosine = -cosine;}
574  condition2 = _mm512_cmpneq_epi32_mask(
575  _mm512_and_si512(_mm512_add_epi32(q, twos), fours), zeros);
576 
577  cosine = _mm512_mask_blend_ps(condition1, cosine, sine);
578  cosine = _mm512_mask_mul_ps(cosine, condition2, cosine, _mm512_set1_ps(-1.f));
579  _mm512_storeu_ps(cosPtr, cosine);
580  inPtr += 16;
581  cosPtr += 16;
582  }
583 
584  number = sixteenPoints * 16;
585  for (; number < num_points; number++) {
586  *cosPtr++ = cosf(*inPtr++);
587  }
588 }
589 #endif
590 
591 #if LV_HAVE_AVX2 && LV_HAVE_FMA
592 #include <immintrin.h>
593 
594 static inline void
595 volk_32f_cos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
596 {
597  float* bPtr = bVector;
598  const float* aPtr = aVector;
599 
600  unsigned int number = 0;
601  unsigned int eighthPoints = num_points / 8;
602  unsigned int i = 0;
603 
604  __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
605  fones, fzeroes;
606  __m256 sine, cosine;
607  __m256i q, ones, twos, fours;
608 
609  m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
610  pio4A = _mm256_set1_ps(0.7853981554508209228515625);
611  pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
612  pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
613  ffours = _mm256_set1_ps(4.0);
614  ftwos = _mm256_set1_ps(2.0);
615  fones = _mm256_set1_ps(1.0);
616  fzeroes = _mm256_setzero_ps();
617  __m256i zeroes = _mm256_set1_epi32(0);
618  ones = _mm256_set1_epi32(1);
619  __m256i allones = _mm256_set1_epi32(0xffffffff);
620  twos = _mm256_set1_epi32(2);
621  fours = _mm256_set1_epi32(4);
622 
623  cp1 = _mm256_set1_ps(1.0);
624  cp2 = _mm256_set1_ps(0.08333333333333333);
625  cp3 = _mm256_set1_ps(0.002777777777777778);
626  cp4 = _mm256_set1_ps(4.96031746031746e-05);
627  cp5 = _mm256_set1_ps(5.511463844797178e-07);
628  union bit256 condition1;
629  union bit256 condition3;
630 
631  for (; number < eighthPoints; number++) {
632 
633  aVal = _mm256_loadu_ps(aPtr);
634  // s = fabs(aVal)
635  s = _mm256_sub_ps(aVal,
636  _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
637  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
638  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
639  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
640  // r = q + q&1, q indicates quadrant, r gives
641  r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
642 
643  s = _mm256_fnmadd_ps(r, pio4A, s);
644  s = _mm256_fnmadd_ps(r, pio4B, s);
645  s = _mm256_fnmadd_ps(r, pio4C, s);
646 
647  s = _mm256_div_ps(
648  s,
649  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
650  s = _mm256_mul_ps(s, s);
651  // Evaluate Taylor series
652  s = _mm256_mul_ps(
653  _mm256_fmadd_ps(
654  _mm256_fmsub_ps(
655  _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
656  s,
657  cp1),
658  s);
659 
660  for (i = 0; i < 3; i++)
661  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
662  s = _mm256_div_ps(s, ftwos);
663 
664  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
665  cosine = _mm256_sub_ps(fones, s);
666 
667  // if(((q+1)&2) != 0) { cosine=sine;}
668  condition1.int_vec =
669  _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
670  condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
671 
672  // if(((q+2)&4) != 0) { cosine = -cosine;}
673  condition3.int_vec = _mm256_cmpeq_epi32(
674  _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
675  condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
676 
677  cosine = _mm256_add_ps(
678  cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
679  cosine = _mm256_sub_ps(cosine,
680  _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
681  condition3.float_vec));
682  _mm256_storeu_ps(bPtr, cosine);
683  aPtr += 8;
684  bPtr += 8;
685  }
686 
687  number = eighthPoints * 8;
688  for (; number < num_points; number++) {
689  *bPtr++ = cos(*aPtr++);
690  }
691 }
692 
693 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
694 
695 #ifdef LV_HAVE_AVX2
696 #include <immintrin.h>
697 
698 static inline void
699 volk_32f_cos_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
700 {
701  float* bPtr = bVector;
702  const float* aPtr = aVector;
703 
704  unsigned int number = 0;
705  unsigned int eighthPoints = num_points / 8;
706  unsigned int i = 0;
707 
708  __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
709  fones, fzeroes;
710  __m256 sine, cosine;
711  __m256i q, ones, twos, fours;
712 
713  m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
714  pio4A = _mm256_set1_ps(0.7853981554508209228515625);
715  pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
716  pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
717  ffours = _mm256_set1_ps(4.0);
718  ftwos = _mm256_set1_ps(2.0);
719  fones = _mm256_set1_ps(1.0);
720  fzeroes = _mm256_setzero_ps();
721  __m256i zeroes = _mm256_set1_epi32(0);
722  ones = _mm256_set1_epi32(1);
723  __m256i allones = _mm256_set1_epi32(0xffffffff);
724  twos = _mm256_set1_epi32(2);
725  fours = _mm256_set1_epi32(4);
726 
727  cp1 = _mm256_set1_ps(1.0);
728  cp2 = _mm256_set1_ps(0.08333333333333333);
729  cp3 = _mm256_set1_ps(0.002777777777777778);
730  cp4 = _mm256_set1_ps(4.96031746031746e-05);
731  cp5 = _mm256_set1_ps(5.511463844797178e-07);
732  union bit256 condition1;
733  union bit256 condition3;
734 
735  for (; number < eighthPoints; number++) {
736 
737  aVal = _mm256_loadu_ps(aPtr);
738  // s = fabs(aVal)
739  s = _mm256_sub_ps(aVal,
740  _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
741  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
742  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
743  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
744  // r = q + q&1, q indicates quadrant, r gives
745  r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
746 
747  s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));
748  s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B));
749  s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C));
750 
751  s = _mm256_div_ps(
752  s,
753  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
754  s = _mm256_mul_ps(s, s);
755  // Evaluate Taylor series
756  s = _mm256_mul_ps(
757  _mm256_add_ps(
758  _mm256_mul_ps(
759  _mm256_sub_ps(
760  _mm256_mul_ps(
761  _mm256_add_ps(
762  _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
763  s),
764  cp3),
765  s),
766  cp2),
767  s),
768  cp1),
769  s);
770 
771  for (i = 0; i < 3; i++)
772  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
773  s = _mm256_div_ps(s, ftwos);
774 
775  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
776  cosine = _mm256_sub_ps(fones, s);
777 
778  // if(((q+1)&2) != 0) { cosine=sine;}
779  condition1.int_vec =
780  _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
781  condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
782 
783  // if(((q+2)&4) != 0) { cosine = -cosine;}
784  condition3.int_vec = _mm256_cmpeq_epi32(
785  _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
786  condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
787 
788  cosine = _mm256_add_ps(
789  cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
790  cosine = _mm256_sub_ps(cosine,
791  _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
792  condition3.float_vec));
793  _mm256_storeu_ps(bPtr, cosine);
794  aPtr += 8;
795  bPtr += 8;
796  }
797 
798  number = eighthPoints * 8;
799  for (; number < num_points; number++) {
800  *bPtr++ = cos(*aPtr++);
801  }
802 }
803 
804 #endif /* LV_HAVE_AVX2 for unaligned */
805 
806 #ifdef LV_HAVE_SSE4_1
807 #include <smmintrin.h>
808 
809 static inline void
810 volk_32f_cos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
811 {
812  float* bPtr = bVector;
813  const float* aPtr = aVector;
814 
815  unsigned int number = 0;
816  unsigned int quarterPoints = num_points / 4;
817  unsigned int i = 0;
818 
819  __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
820  fzeroes;
821  __m128 sine, cosine, condition1, condition3;
822  __m128i q, r, ones, twos, fours;
823 
824  m4pi = _mm_set1_ps(1.273239545);
825  pio4A = _mm_set1_ps(0.78515625);
826  pio4B = _mm_set1_ps(0.241876e-3);
827  ffours = _mm_set1_ps(4.0);
828  ftwos = _mm_set1_ps(2.0);
829  fones = _mm_set1_ps(1.0);
830  fzeroes = _mm_setzero_ps();
831  ones = _mm_set1_epi32(1);
832  twos = _mm_set1_epi32(2);
833  fours = _mm_set1_epi32(4);
834 
835  cp1 = _mm_set1_ps(1.0);
836  cp2 = _mm_set1_ps(0.83333333e-1);
837  cp3 = _mm_set1_ps(0.2777778e-2);
838  cp4 = _mm_set1_ps(0.49603e-4);
839  cp5 = _mm_set1_ps(0.551e-6);
840 
841  for (; number < quarterPoints; number++) {
842  aVal = _mm_loadu_ps(aPtr);
843  s = _mm_sub_ps(aVal,
844  _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
845  q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
846  r = _mm_add_epi32(q, _mm_and_si128(q, ones));
847 
848  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
849  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
850 
851  s = _mm_div_ps(
852  s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
853  s = _mm_mul_ps(s, s);
854  // Evaluate Taylor series
855  s = _mm_mul_ps(
856  _mm_add_ps(
857  _mm_mul_ps(
858  _mm_sub_ps(
859  _mm_mul_ps(
860  _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
861  cp3),
862  s),
863  cp2),
864  s),
865  cp1),
866  s);
867 
868  for (i = 0; i < 3; i++) {
869  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
870  }
871  s = _mm_div_ps(s, ftwos);
872 
873  sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
874  cosine = _mm_sub_ps(fones, s);
875 
876  condition1 = _mm_cmpneq_ps(
877  _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
878 
879  condition3 = _mm_cmpneq_ps(
880  _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
881 
882  cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
883  cosine = _mm_sub_ps(
884  cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
885  _mm_storeu_ps(bPtr, cosine);
886  aPtr += 4;
887  bPtr += 4;
888  }
889 
890  number = quarterPoints * 4;
891  for (; number < num_points; number++) {
892  *bPtr++ = cosf(*aPtr++);
893  }
894 }
895 
896 #endif /* LV_HAVE_SSE4_1 for unaligned */
897 
898 
899 #ifdef LV_HAVE_GENERIC
900 
901 /*
902  * For derivation see
903  * Shibata, Naoki, "Efficient evaluation methods of elementary functions
904  * suitable for SIMD computation," in Springer-Verlag 2010
905  */
906 static inline void volk_32f_cos_32f_generic_fast(float* bVector,
907  const float* aVector,
908  unsigned int num_points)
909 {
910  float* bPtr = bVector;
911  const float* aPtr = aVector;
912 
913  float m4pi = 1.273239544735162542821171882678754627704620361328125;
914  float pio4A = 0.7853981554508209228515625;
915  float pio4B = 0.794662735614792836713604629039764404296875e-8;
916  float pio4C = 0.306161699786838294306516483068750264552437361480769e-16;
917  int N = 3; // order of argument reduction
918 
919  unsigned int number;
920  for (number = 0; number < num_points; number++) {
921  float s = fabs(*aPtr);
922  int q = (int)(s * m4pi);
923  int r = q + (q & 1);
924  s -= r * pio4A;
925  s -= r * pio4B;
926  s -= r * pio4C;
927 
928  s = s * 0.125; // 2^-N (<--3)
929  s = s * s;
930  s = ((((s / 1814400. - 1.0 / 20160.0) * s + 1.0 / 360.0) * s - 1.0 / 12.0) * s +
931  1.0) *
932  s;
933 
934  int i;
935  for (i = 0; i < N; ++i) {
936  s = (4.0 - s) * s;
937  }
938  s = s / 2.0;
939 
940  float sine = sqrt((2.0 - s) * s);
941  float cosine = 1 - s;
942 
943  if (((q + 1) & 2) != 0) {
944  s = cosine;
945  cosine = sine;
946  sine = s;
947  }
948  if (((q + 2) & 4) != 0) {
949  cosine = -cosine;
950  }
951  *bPtr = cosine;
952  bPtr++;
953  aPtr++;
954  }
955 }
956 
957 #endif /* LV_HAVE_GENERIC */
958 
959 
960 #ifdef LV_HAVE_GENERIC
961 
962 static inline void
963 volk_32f_cos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
964 {
965  float* bPtr = bVector;
966  const float* aPtr = aVector;
967  unsigned int number = 0;
968 
969  for (; number < num_points; number++) {
970  *bPtr++ = cosf(*aPtr++);
971  }
972 }
973 
974 #endif /* LV_HAVE_GENERIC */
975 
976 
977 #ifdef LV_HAVE_NEON
978 #include <arm_neon.h>
980 
981 static inline void
982 volk_32f_cos_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
983 {
984  unsigned int number = 0;
985  unsigned int quarter_points = num_points / 4;
986  float* bVectorPtr = bVector;
987  const float* aVectorPtr = aVector;
988 
989  float32x4_t b_vec;
990  float32x4_t a_vec;
991 
992  for (number = 0; number < quarter_points; number++) {
993  a_vec = vld1q_f32(aVectorPtr);
994  // Prefetch next one, speeds things up
995  __VOLK_PREFETCH(aVectorPtr + 4);
996  b_vec = _vcosq_f32(a_vec);
997  vst1q_f32(bVectorPtr, b_vec);
998  // move pointers ahead
999  bVectorPtr += 4;
1000  aVectorPtr += 4;
1001  }
1002 
1003  // Deal with the rest
1004  for (number = quarter_points * 4; number < num_points; number++) {
1005  *bVectorPtr++ = cosf(*aVectorPtr++);
1006  }
1007 }
1008 
1009 #endif /* LV_HAVE_NEON */
1010 
1011 
1012 #endif /* INCLUDED_volk_32f_cos_32f_u_H */
Definition: volk_common.h:111
Definition: volk_common.h:128
static void volk_32f_cos_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_cos_32f.h:963
static void volk_32f_cos_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_cos_32f.h:982
static void volk_32f_cos_32f_generic_fast(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_cos_32f.h:906
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
for i
Definition: volk_config_fixed.tmpl.h:25
static float32x4_t _vcosq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:269