Vector Optimized Library of Kernels  2.5.1
Architecture-tuned implementations of math kernels
volk_32f_asin_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
70 #include <inttypes.h>
71 #include <math.h>
72 #include <stdio.h>
73 
74 /* This is the number of terms of Taylor series to evaluate, increase this for more
75  * accuracy*/
76 #define ASIN_TERMS 2
77 
78 #ifndef INCLUDED_volk_32f_asin_32f_a_H
79 #define INCLUDED_volk_32f_asin_32f_a_H
80 
81 #if LV_HAVE_AVX2 && LV_HAVE_FMA
82 #include <immintrin.h>
83 
static inline void volk_32f_asin_32f_a_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    /* Computes bVector[i] = asin(aVector[i]) for num_points floats.
     * Requires 32-byte aligned input/output buffers (AVX2 + FMA path).
     *
     * Method: asin(a) = atan(a / sqrt((1 + a)(1 - a))).  The arctangent is
     * evaluated by (1) taking the magnitude and folding values below 1 onto
     * their reciprocal so the working value w >= 1, (2) two half-angle steps
     * w <- w + sqrt(1 + w^2) plus a final reciprocal, which leave
     * tan((pi/2 - atan(w)) / 4), (3) a Horner-evaluated ASIN_TERMS-term
     * Taylor polynomial times 4x, giving pi/2 - atan(w), and (4) undoing
     * the reciprocal fold and restoring the sign. */
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arcsine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    /* Taylor coefficients (-1)^j / (2j + 1), hoisted out of the hot loop:
     * pow() is not guaranteed to be constant-folded by every compiler. */
    float coeffs[ASIN_TERMS];
    for (j = 0; j < ASIN_TERMS; j++) {
        coeffs[j] = pow(-1, j) / (2 * j + 1);
    }

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        /* Map the asin argument to the equivalent atan argument. */
        aVal = _mm256_div_ps(aVal,
                             _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))));
        z = aVal;
        /* z = |z| (subtract 2z where z < 0). */
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        /* Fold z < 1 lanes onto 1/z so the working value is >= 1. */
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        /* Two half-angle reductions; the reciprocal below yields
         * tan((pi/2 - atan(x)) / 4), small enough for the short series. */
        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        /* Horner evaluation of the truncated atan Taylor series in x^2. */
        for (j = ASIN_TERMS - 1; j >= 0; j--) {
            y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(coeffs[j]));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        /* Lanes that were NOT folded (z > 1) need atan(z) = pi/2 - y. */
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
        arcsine = y;
        /* Restore the sign of the input. */
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arcsine = _mm256_sub_ps(arcsine,
                                _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));

        _mm256_store_ps(bPtr, arcsine);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    /* Scalar tail.  Use asinf (not asin) to stay in single precision,
     * matching the SSE kernels and avoiding a double round-trip. */
    for (; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}
145 
146 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
147 
148 
149 #ifdef LV_HAVE_AVX
150 #include <immintrin.h>
151 
static inline void
volk_32f_asin_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
{
    /* Computes bVector[i] = asin(aVector[i]) for num_points floats.
     * Requires 32-byte aligned input/output buffers (AVX path, no FMA).
     *
     * Method: asin(a) = atan(a / sqrt((1 + a)(1 - a))).  The arctangent is
     * evaluated by folding the magnitude onto a working value w >= 1, two
     * half-angle steps w <- w + sqrt(1 + w^2) plus a reciprocal (leaving
     * tan((pi/2 - atan(w)) / 4)), a Horner-evaluated ASIN_TERMS-term Taylor
     * polynomial times 4x, then undoing the fold and restoring the sign. */
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arcsine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    /* Taylor coefficients (-1)^j / (2j + 1), hoisted out of the hot loop:
     * pow() is not guaranteed to be constant-folded by every compiler. */
    float coeffs[ASIN_TERMS];
    for (j = 0; j < ASIN_TERMS; j++) {
        coeffs[j] = pow(-1, j) / (2 * j + 1);
    }

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        /* Map the asin argument to the equivalent atan argument. */
        aVal = _mm256_div_ps(aVal,
                             _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))));
        z = aVal;
        /* z = |z| (subtract 2z where z < 0). */
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        /* Fold z < 1 lanes onto 1/z so the working value is >= 1. */
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        /* Two half-angle reductions; the reciprocal below yields
         * tan((pi/2 - atan(x)) / 4), small enough for the short series. */
        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x,
                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        /* Horner evaluation of the truncated atan Taylor series in x^2. */
        for (j = ASIN_TERMS - 1; j >= 0; j--) {
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
                              _mm256_set1_ps(coeffs[j]));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        /* Lanes that were NOT folded (z > 1) need atan(z) = pi/2 - y. */
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(
            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
        arcsine = y;
        /* Restore the sign of the input. */
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arcsine = _mm256_sub_ps(arcsine,
                                _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));

        _mm256_store_ps(bPtr, arcsine);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    /* Scalar tail.  Use asinf (not asin) to stay in single precision,
     * matching the SSE kernels and avoiding a double round-trip. */
    for (; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}
214 
215 #endif /* LV_HAVE_AVX for aligned */
216 
217 #ifdef LV_HAVE_SSE4_1
218 #include <smmintrin.h>
219 
220 static inline void
221 volk_32f_asin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
222 {
223  float* bPtr = bVector;
224  const float* aPtr = aVector;
225 
226  unsigned int number = 0;
227  unsigned int quarterPoints = num_points / 4;
228  int i, j;
229 
230  __m128 aVal, pio2, x, y, z, arcsine;
231  __m128 fzeroes, fones, ftwos, ffours, condition;
232 
233  pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
234  fzeroes = _mm_setzero_ps();
235  fones = _mm_set1_ps(1.0);
236  ftwos = _mm_set1_ps(2.0);
237  ffours = _mm_set1_ps(4.0);
238 
239  for (; number < quarterPoints; number++) {
240  aVal = _mm_load_ps(aPtr);
241  aVal = _mm_div_ps(
242  aVal,
243  _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
244  z = aVal;
245  condition = _mm_cmplt_ps(z, fzeroes);
246  z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
247  condition = _mm_cmplt_ps(z, fones);
248  x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
249 
250  for (i = 0; i < 2; i++) {
251  x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
252  }
253  x = _mm_div_ps(fones, x);
254  y = fzeroes;
255  for (j = ASIN_TERMS - 1; j >= 0; j--) {
256  y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
257  _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
258  }
259 
260  y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
261  condition = _mm_cmpgt_ps(z, fones);
262 
263  y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
264  arcsine = y;
265  condition = _mm_cmplt_ps(aVal, fzeroes);
266  arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
267 
268  _mm_store_ps(bPtr, arcsine);
269  aPtr += 4;
270  bPtr += 4;
271  }
272 
273  number = quarterPoints * 4;
274  for (; number < num_points; number++) {
275  *bPtr++ = asinf(*aPtr++);
276  }
277 }
278 
279 #endif /* LV_HAVE_SSE4_1 for aligned */
280 
281 #endif /* INCLUDED_volk_32f_asin_32f_a_H */
282 
283 #ifndef INCLUDED_volk_32f_asin_32f_u_H
284 #define INCLUDED_volk_32f_asin_32f_u_H
285 
286 #if LV_HAVE_AVX2 && LV_HAVE_FMA
287 #include <immintrin.h>
288 
static inline void volk_32f_asin_32f_u_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    /* Computes bVector[i] = asin(aVector[i]) for num_points floats.
     * Unaligned AVX2 + FMA path (no alignment requirements).
     *
     * Method: asin(a) = atan(a / sqrt((1 + a)(1 - a))).  The arctangent is
     * evaluated by (1) taking the magnitude and folding values below 1 onto
     * their reciprocal so the working value w >= 1, (2) two half-angle steps
     * w <- w + sqrt(1 + w^2) plus a final reciprocal, which leave
     * tan((pi/2 - atan(w)) / 4), (3) a Horner-evaluated ASIN_TERMS-term
     * Taylor polynomial times 4x, giving pi/2 - atan(w), and (4) undoing
     * the reciprocal fold and restoring the sign. */
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arcsine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    /* Taylor coefficients (-1)^j / (2j + 1), hoisted out of the hot loop:
     * pow() is not guaranteed to be constant-folded by every compiler. */
    float coeffs[ASIN_TERMS];
    for (j = 0; j < ASIN_TERMS; j++) {
        coeffs[j] = pow(-1, j) / (2 * j + 1);
    }

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        /* Map the asin argument to the equivalent atan argument. */
        aVal = _mm256_div_ps(aVal,
                             _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))));
        z = aVal;
        /* z = |z| (subtract 2z where z < 0). */
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        /* Fold z < 1 lanes onto 1/z so the working value is >= 1. */
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        /* Two half-angle reductions; the reciprocal below yields
         * tan((pi/2 - atan(x)) / 4), small enough for the short series. */
        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        /* Horner evaluation of the truncated atan Taylor series in x^2. */
        for (j = ASIN_TERMS - 1; j >= 0; j--) {
            y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(coeffs[j]));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        /* Lanes that were NOT folded (z > 1) need atan(z) = pi/2 - y. */
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
        arcsine = y;
        /* Restore the sign of the input. */
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arcsine = _mm256_sub_ps(arcsine,
                                _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));

        _mm256_storeu_ps(bPtr, arcsine);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    /* Scalar tail.  Use asinf (not asin) to stay in single precision,
     * matching the SSE kernels and avoiding a double round-trip. */
    for (; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}
350 
351 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
352 
353 
354 #ifdef LV_HAVE_AVX
355 #include <immintrin.h>
356 
static inline void
volk_32f_asin_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
{
    /* Computes bVector[i] = asin(aVector[i]) for num_points floats.
     * Unaligned AVX path (no FMA, no alignment requirements).
     *
     * Method: asin(a) = atan(a / sqrt((1 + a)(1 - a))).  The arctangent is
     * evaluated by folding the magnitude onto a working value w >= 1, two
     * half-angle steps w <- w + sqrt(1 + w^2) plus a reciprocal (leaving
     * tan((pi/2 - atan(w)) / 4)), a Horner-evaluated ASIN_TERMS-term Taylor
     * polynomial times 4x, then undoing the fold and restoring the sign. */
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arcsine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    /* Taylor coefficients (-1)^j / (2j + 1), hoisted out of the hot loop:
     * pow() is not guaranteed to be constant-folded by every compiler. */
    float coeffs[ASIN_TERMS];
    for (j = 0; j < ASIN_TERMS; j++) {
        coeffs[j] = pow(-1, j) / (2 * j + 1);
    }

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        /* Map the asin argument to the equivalent atan argument. */
        aVal = _mm256_div_ps(aVal,
                             _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))));
        z = aVal;
        /* z = |z| (subtract 2z where z < 0). */
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        /* Fold z < 1 lanes onto 1/z so the working value is >= 1. */
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        /* Two half-angle reductions; the reciprocal below yields
         * tan((pi/2 - atan(x)) / 4), small enough for the short series. */
        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x,
                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        /* Horner evaluation of the truncated atan Taylor series in x^2. */
        for (j = ASIN_TERMS - 1; j >= 0; j--) {
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
                              _mm256_set1_ps(coeffs[j]));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        /* Lanes that were NOT folded (z > 1) need atan(z) = pi/2 - y. */
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(
            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
        arcsine = y;
        /* Restore the sign of the input. */
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arcsine = _mm256_sub_ps(arcsine,
                                _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));

        _mm256_storeu_ps(bPtr, arcsine);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    /* Scalar tail.  Use asinf (not asin) to stay in single precision,
     * matching the SSE kernels and avoiding a double round-trip. */
    for (; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}
419 
420 #endif /* LV_HAVE_AVX for unaligned */
421 
422 
423 #ifdef LV_HAVE_SSE4_1
424 #include <smmintrin.h>
425 
426 static inline void
427 volk_32f_asin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
428 {
429  float* bPtr = bVector;
430  const float* aPtr = aVector;
431 
432  unsigned int number = 0;
433  unsigned int quarterPoints = num_points / 4;
434  int i, j;
435 
436  __m128 aVal, pio2, x, y, z, arcsine;
437  __m128 fzeroes, fones, ftwos, ffours, condition;
438 
439  pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
440  fzeroes = _mm_setzero_ps();
441  fones = _mm_set1_ps(1.0);
442  ftwos = _mm_set1_ps(2.0);
443  ffours = _mm_set1_ps(4.0);
444 
445  for (; number < quarterPoints; number++) {
446  aVal = _mm_loadu_ps(aPtr);
447  aVal = _mm_div_ps(
448  aVal,
449  _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
450  z = aVal;
451  condition = _mm_cmplt_ps(z, fzeroes);
452  z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
453  condition = _mm_cmplt_ps(z, fones);
454  x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
455 
456  for (i = 0; i < 2; i++) {
457  x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
458  }
459  x = _mm_div_ps(fones, x);
460  y = fzeroes;
461  for (j = ASIN_TERMS - 1; j >= 0; j--) {
462  y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
463  _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
464  }
465 
466  y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
467  condition = _mm_cmpgt_ps(z, fones);
468 
469  y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
470  arcsine = y;
471  condition = _mm_cmplt_ps(aVal, fzeroes);
472  arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
473 
474  _mm_storeu_ps(bPtr, arcsine);
475  aPtr += 4;
476  bPtr += 4;
477  }
478 
479  number = quarterPoints * 4;
480  for (; number < num_points; number++) {
481  *bPtr++ = asinf(*aPtr++);
482  }
483 }
484 
485 #endif /* LV_HAVE_SSE4_1 for unaligned */
486 
487 #ifdef LV_HAVE_GENERIC
488 
489 static inline void
490 volk_32f_asin_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points)
491 {
492  float* bPtr = bVector;
493  const float* aPtr = aVector;
494  unsigned int number = 0;
495 
496  for (number = 0; number < num_points; number++) {
497  *bPtr++ = asinf(*aPtr++);
498  }
499 }
500 #endif /* LV_HAVE_GENERIC */
501 
502 #endif /* INCLUDED_volk_32f_asin_32f_u_H */
#define ASIN_TERMS
Definition: volk_32f_asin_32f.h:76
static void volk_32f_asin_32f_u_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_asin_32f.h:358
static void volk_32f_asin_32f_a_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_asin_32f.h:153
static void volk_32f_asin_32f_u_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_asin_32f.h:490
for i
Definition: volk_config_fixed.tmpl.h:25