76 #ifndef INCLUDED_volk_32f_sin_32f_a_H
77 #define INCLUDED_volk_32f_sin_32f_a_H
80 #if LV_HAVE_AVX2 && LV_HAVE_FMA
81 #include <immintrin.h>
/*
 * Aligned AVX2 + FMA kernel: stores sin(aVector[i]) into bVector[i].
 *
 * bVector    - output buffer; written with _mm256_store_ps, which requires a
 *              32-byte aligned address
 * aVector    - input buffer; read with _mm256_load_ps (same requirement)
 * num_points - number of floats to process
 *
 * NOTE(review): this excerpt omits several lines of the function (return
 * type, braces, parts of some multi-line expressions). The comments below
 * describe only what the visible lines do.
 */
84 volk_32f_sin_32f_a_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
86 float* bPtr = bVector;
87 const float* aPtr = aVector;
89 unsigned int number = 0;
/* Number of complete 8-float vectors in the input. */
90 unsigned int eighthPoints = num_points / 8;
/* (the declaration below ends in a comma; its continuation is not visible) */
93 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
95 __m256 sine, cosine, condition1, condition2;
96 __m256i q, r, ones, twos, fours;
/* 1.273239545 ~= 4/pi: scales the argument to units of pi/4. */
98 m4pi = _mm256_set1_ps(1.273239545);
/* pio4A + pio4B ~= pi/4, kept as two terms that are subtracted separately. */
99 pio4A = _mm256_set1_ps(0.78515625);
100 pio4B = _mm256_set1_ps(0.241876e-3);
101 ffours = _mm256_set1_ps(4.0);
102 ftwos = _mm256_set1_ps(2.0);
103 fones = _mm256_set1_ps(1.0);
104 fzeroes = _mm256_setzero_ps();
105 ones = _mm256_set1_epi32(1);
106 twos = _mm256_set1_epi32(2);
107 fours = _mm256_set1_epi32(4);
/*
 * Polynomial coefficients; to the printed precision they equal
 * 1, 1/12, 1/360, 1/20160 and 1/1814400. The expression that consumes them
 * is only partly visible in this excerpt.
 */
109 cp1 = _mm256_set1_ps(1.0);
110 cp2 = _mm256_set1_ps(0.83333333e-1);
111 cp3 = _mm256_set1_ps(0.2777778e-2);
112 cp4 = _mm256_set1_ps(0.49603e-4);
113 cp5 = _mm256_set1_ps(0.551e-6);
/* Vector loop: one 8-float load per iteration. */
115 for (; number < eighthPoints; number++) {
116 aVal = _mm256_load_ps(aPtr);
/* s = |aVal|: 2*aVal is subtracted only in lanes where aVal < 0. */
117 s = _mm256_sub_ps(aVal,
118 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
119 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
/* q = floor(s * m4pi) as int32; r = q bumped up to the next even value. */
120 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
121 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
/* s = s - r*pio4A, then s = s - r*pio4B (fused negated multiply-add). */
123 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
124 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
/* (tail of a statement whose first line is not visible in this excerpt) */
128 _mm256_set1_ps(8.0));
/* s = s squared. */
129 s = _mm256_mul_ps(s, s);
/* (inner part of a nested FMA polynomial in s; enclosing lines not visible) */
134 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
/* Apply s = s * (4 - s) three times. */
139 for (
i = 0;
i < 3;
i++) {
140 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
/* s = s / 2. */
142 s = _mm256_div_ps(s, ftwos);
/* sine = sqrt((2 - s) * s); cosine = 1 - s. */
144 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
145 cosine = _mm256_sub_ps(fones, s);
/* condition1 is derived from (q + 1) & 2; the rest of the compare is not visible. */
147 condition1 = _mm256_cmp_ps(
148 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
/*
 * condition2 combines the masks "(q & 4) != 0" and "aVal < 0"; the predicate
 * that combines them is not visible in this excerpt.
 */
151 condition2 = _mm256_cmp_ps(
153 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
154 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
/*
 * sine + ((cosine - sine) & condition1): yields cosine in lanes where
 * condition1 is set, sine elsewhere. The assignment target is on a line that
 * is not visible here.
 */
161 _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
/* Negate lanes where condition2 is set (sine - 2*sine). */
162 sine = _mm256_sub_ps(
163 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
/* Aligned store of the 8 results. */
164 _mm256_store_ps(bPtr, sine);
/* Scalar tail for the elements left over after the full vectors. */
169 number = eighthPoints * 8;
170 for (; number < num_points; number++) {
/*
 * NOTE(review): this calls double-precision sin() on float data, while the
 * SSE4.1 kernels in this file call sinf() at the same spot. Consider sinf()
 * for consistency.
 */
171 *bPtr++ = sin(*aPtr++);
178 #include <immintrin.h>
/*
 * Aligned AVX2 kernel (no FMA): stores sin(aVector[i]) into bVector[i].
 *
 * bVector    - output buffer; written with _mm256_store_ps, which requires a
 *              32-byte aligned address
 * aVector    - input buffer; read with _mm256_load_ps (same requirement)
 * num_points - number of floats to process
 *
 * NOTE(review): several lines of this function are missing from the excerpt
 * (return type, braces, parts of multi-line expressions). Only visible lines
 * are described.
 */
181 volk_32f_sin_32f_a_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
183 float* bPtr = bVector;
184 const float* aPtr = aVector;
186 unsigned int number = 0;
/* Count of complete 8-float vectors. */
187 unsigned int eighthPoints = num_points / 8;
/* (declaration continues on a line that is not visible) */
190 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
192 __m256 sine, cosine, condition1, condition2;
193 __m256i q, r, ones, twos, fours;
/* m4pi ~= 4/pi. */
195 m4pi = _mm256_set1_ps(1.273239545);
/* pio4A + pio4B ~= pi/4, split into a coarse and a fine term. */
196 pio4A = _mm256_set1_ps(0.78515625);
197 pio4B = _mm256_set1_ps(0.241876e-3);
198 ffours = _mm256_set1_ps(4.0);
199 ftwos = _mm256_set1_ps(2.0);
200 fones = _mm256_set1_ps(1.0);
201 fzeroes = _mm256_setzero_ps();
202 ones = _mm256_set1_epi32(1);
203 twos = _mm256_set1_epi32(2);
204 fours = _mm256_set1_epi32(4);
/*
 * Polynomial coefficients (1, 1/12, 1/360, 1/20160, 1/1814400 to the printed
 * precision). Only the use of cp5 and cp4 is visible in this excerpt.
 */
206 cp1 = _mm256_set1_ps(1.0);
207 cp2 = _mm256_set1_ps(0.83333333e-1);
208 cp3 = _mm256_set1_ps(0.2777778e-2);
209 cp4 = _mm256_set1_ps(0.49603e-4);
210 cp5 = _mm256_set1_ps(0.551e-6);
/* Vector loop: one aligned 8-float load per iteration. */
212 for (; number < eighthPoints; number++) {
213 aVal = _mm256_load_ps(aPtr);
/* s = |aVal|, formed by subtracting 2*aVal in the negative lanes only. */
214 s = _mm256_sub_ps(aVal,
215 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
216 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
/* q = floor(s * m4pi) as int32; r = q + (q & 1), i.e. q made even. */
217 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
218 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
/* s = s - r*pio4A, then s = s - r*pio4B, as separate multiply and subtract. */
220 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
221 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
/* (tail of a statement whose beginning is not visible) */
225 _mm256_set1_ps(8.0));
/* s = s squared. */
226 s = _mm256_mul_ps(s, s);
/* (fragment of the polynomial evaluation: (s*cp5 - cp4) times a missing operand) */
234 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
/* Three rounds of s = s * (4 - s). */
243 for (
i = 0;
i < 3;
i++) {
244 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
/* s = s / 2. */
246 s = _mm256_div_ps(s, ftwos);
/* sine = sqrt((2 - s) * s); cosine = 1 - s. */
248 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
249 cosine = _mm256_sub_ps(fones, s);
/* condition1 starts from (q + 1) & 2; remaining compare operands are not visible. */
251 condition1 = _mm256_cmp_ps(
252 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
/*
 * condition2 is built from the masks "(q & 4) != 0" and "aVal < 0"; the outer
 * comparison predicate is not visible.
 */
255 condition2 = _mm256_cmp_ps(
257 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
258 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
/*
 * Blend: cosine where condition1 is set, sine otherwise. The left-hand side
 * of this assignment is on a line that is not visible.
 */
265 _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
/* Flip the sign in lanes where condition2 is set. */
266 sine = _mm256_sub_ps(
267 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
/* Aligned store of the 8 results. */
268 _mm256_store_ps(bPtr, sine);
/* Scalar tail for the remaining elements. */
273 number = eighthPoints * 8;
274 for (; number < num_points; number++) {
/*
 * NOTE(review): double-precision sin() is used here; the SSE4.1 kernels in
 * this file use sinf() in the equivalent loop. Consider sinf() for
 * consistency.
 */
275 *bPtr++ = sin(*aPtr++);
281 #ifdef LV_HAVE_SSE4_1
282 #include <smmintrin.h>
/*
 * Aligned SSE4.1 kernel: stores sin(aVector[i]) into bVector[i].
 *
 * bVector    - output buffer; written with _mm_store_ps, which requires a
 *              16-byte aligned address
 * aVector    - input buffer; read with _mm_load_ps (same requirement)
 * num_points - number of floats to process
 *
 * NOTE(review): several lines of this function are missing from the excerpt
 * (return type, braces, first lines of some statements). Only visible lines
 * are described.
 */
285 volk_32f_sin_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
287 float* bPtr = bVector;
288 const float* aPtr = aVector;
290 unsigned int number = 0;
/* Count of complete 4-float vectors. */
291 unsigned int quarterPoints = num_points / 4;
/* (declaration continues on a line that is not visible) */
294 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
296 __m128 sine, cosine, condition1, condition2;
297 __m128i q, r, ones, twos, fours;
/* m4pi ~= 4/pi. */
299 m4pi = _mm_set1_ps(1.273239545);
/* pio4A + pio4B ~= pi/4, split into two terms subtracted one after the other. */
300 pio4A = _mm_set1_ps(0.78515625);
301 pio4B = _mm_set1_ps(0.241876e-3);
302 ffours = _mm_set1_ps(4.0);
303 ftwos = _mm_set1_ps(2.0);
304 fones = _mm_set1_ps(1.0);
305 fzeroes = _mm_setzero_ps();
306 ones = _mm_set1_epi32(1);
307 twos = _mm_set1_epi32(2);
308 fours = _mm_set1_epi32(4);
/*
 * Polynomial coefficients (1, 1/12, 1/360, 1/20160, 1/1814400 to the printed
 * precision). Only the use of cp5 and cp4 is visible in this excerpt.
 */
310 cp1 = _mm_set1_ps(1.0);
311 cp2 = _mm_set1_ps(0.83333333e-1);
312 cp3 = _mm_set1_ps(0.2777778e-2);
313 cp4 = _mm_set1_ps(0.49603e-4);
314 cp5 = _mm_set1_ps(0.551e-6);
/* Vector loop: one aligned 4-float load per iteration. */
316 for (; number < quarterPoints; number++) {
317 aVal = _mm_load_ps(aPtr);
/*
 * Masked term: 2*aVal in lanes where aVal < 0, zero elsewhere. The start of
 * the statement that uses it is not visible.
 */
319 _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
/* q = floor(s * m4pi) as int32; r = q + (q & 1), i.e. q made even. */
320 q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
321 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
/* s = s - r*pio4A, then s = s - r*pio4B. */
323 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
324 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
/* (tail of a statement whose beginning is not visible) */
327 s, _mm_set1_ps(8.0));
/* s = s squared. */
328 s = _mm_mul_ps(s, s);
/* (fragment of the polynomial evaluation: (s*cp5 - cp4)*s plus a missing operand) */
335 _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
/* Three rounds of s = s * (4 - s). */
343 for (
i = 0;
i < 3;
i++) {
344 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
/* s = s / 2. */
346 s = _mm_div_ps(s, ftwos);
/* sine = sqrt((2 - s) * s); cosine = 1 - s. */
348 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
349 cosine = _mm_sub_ps(fones, s);
/* condition1: lanes where ((q + 1) & 2) is non-zero. */
351 condition1 = _mm_cmpneq_ps(
352 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
/*
 * condition2: _mm_cmpneq_ps applied to the two lane masks "(q & 4) != 0" and
 * "aVal < 0".
 * NOTE(review): an all-ones mask lane is a NaN bit pattern, so confirm the
 * compare gives the intended result when both masks are set in a lane.
 */
353 condition2 = _mm_cmpneq_ps(
354 _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
355 _mm_cmplt_ps(aVal, fzeroes));
/* Take cosine in lanes where condition1 is set, sine otherwise. */
360 sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
/*
 * sine - (2*sine & condition2): sign flip in lanes where condition2 is set.
 * The assignment target is on a line that is not visible.
 */
362 _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
/* Aligned store of the 4 results. */
363 _mm_store_ps(bPtr, sine);
/* Scalar tail: single-precision sinf() for the remaining elements. */
368 number = quarterPoints * 4;
369 for (; number < num_points; number++) {
370 *bPtr++ = sinf(*aPtr++);
379 #ifndef INCLUDED_volk_32f_sin_32f_u_H
380 #define INCLUDED_volk_32f_sin_32f_u_H
382 #if LV_HAVE_AVX2 && LV_HAVE_FMA
383 #include <immintrin.h>
/*
 * Unaligned AVX2 + FMA kernel: stores sin(aVector[i]) into bVector[i].
 *
 * bVector    - output buffer; written with the unaligned _mm256_storeu_ps
 * aVector    - input buffer; read with the unaligned _mm256_loadu_ps
 * num_points - number of floats to process
 *
 * NOTE(review): several lines of this function are missing from the excerpt
 * (return type, braces, parts of multi-line expressions). Only visible lines
 * are described.
 */
386 volk_32f_sin_32f_u_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
388 float* bPtr = bVector;
389 const float* aPtr = aVector;
391 unsigned int number = 0;
/* Count of complete 8-float vectors. */
392 unsigned int eighthPoints = num_points / 8;
/* (declaration continues on a line that is not visible) */
395 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
397 __m256 sine, cosine, condition1, condition2;
398 __m256i q, r, ones, twos, fours;
/* m4pi ~= 4/pi. */
400 m4pi = _mm256_set1_ps(1.273239545);
/* pio4A + pio4B ~= pi/4, split into a coarse and a fine term. */
401 pio4A = _mm256_set1_ps(0.78515625);
402 pio4B = _mm256_set1_ps(0.241876e-3);
403 ffours = _mm256_set1_ps(4.0);
404 ftwos = _mm256_set1_ps(2.0);
405 fones = _mm256_set1_ps(1.0);
406 fzeroes = _mm256_setzero_ps();
407 ones = _mm256_set1_epi32(1);
408 twos = _mm256_set1_epi32(2);
409 fours = _mm256_set1_epi32(4);
/*
 * Polynomial coefficients (1, 1/12, 1/360, 1/20160, 1/1814400 to the printed
 * precision); the expression that uses them is only partly visible.
 */
411 cp1 = _mm256_set1_ps(1.0);
412 cp2 = _mm256_set1_ps(0.83333333e-1);
413 cp3 = _mm256_set1_ps(0.2777778e-2);
414 cp4 = _mm256_set1_ps(0.49603e-4);
415 cp5 = _mm256_set1_ps(0.551e-6);
/* Vector loop: one unaligned 8-float load per iteration. */
417 for (; number < eighthPoints; number++) {
418 aVal = _mm256_loadu_ps(aPtr);
/* s = |aVal|: 2*aVal is subtracted only where aVal < 0. */
419 s = _mm256_sub_ps(aVal,
420 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
421 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
/* q = floor(s * m4pi) as int32; r = q + (q & 1), i.e. q made even. */
422 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
423 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
/* s = s - r*pio4A, then s = s - r*pio4B (fused negated multiply-add). */
425 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
426 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
/* (tail of a statement whose beginning is not visible) */
430 _mm256_set1_ps(8.0));
/* s = s squared. */
431 s = _mm256_mul_ps(s, s);
/* (inner part of a nested FMA polynomial in s; enclosing lines not visible) */
436 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
/* Three rounds of s = s * (4 - s). */
441 for (
i = 0;
i < 3;
i++) {
442 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
/* s = s / 2. */
444 s = _mm256_div_ps(s, ftwos);
/* sine = sqrt((2 - s) * s); cosine = 1 - s. */
446 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
447 cosine = _mm256_sub_ps(fones, s);
/* condition1 starts from (q + 1) & 2; remaining compare operands are not visible. */
449 condition1 = _mm256_cmp_ps(
450 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
/*
 * condition2 is built from the masks "(q & 4) != 0" and "aVal < 0"; the outer
 * comparison predicate is not visible.
 */
453 condition2 = _mm256_cmp_ps(
455 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
456 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
/*
 * Blend: cosine where condition1 is set, sine otherwise. The left-hand side
 * of this assignment is on a line that is not visible.
 */
463 _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
/* Flip the sign in lanes where condition2 is set. */
464 sine = _mm256_sub_ps(
465 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
/* Unaligned store of the 8 results. */
466 _mm256_storeu_ps(bPtr, sine);
/* Scalar tail for the remaining elements. */
471 number = eighthPoints * 8;
472 for (; number < num_points; number++) {
/*
 * NOTE(review): double-precision sin() is used here; the SSE4.1 kernels in
 * this file use sinf() in the equivalent loop. Consider sinf() for
 * consistency.
 */
473 *bPtr++ = sin(*aPtr++);
480 #include <immintrin.h>
/*
 * Unaligned AVX2 kernel (no FMA): stores sin(aVector[i]) into bVector[i].
 *
 * bVector    - output buffer; written with the unaligned _mm256_storeu_ps
 * aVector    - input buffer; read with the unaligned _mm256_loadu_ps
 * num_points - number of floats to process
 *
 * NOTE(review): several lines of this function are missing from the excerpt
 * (return type, braces, parts of multi-line expressions). Only visible lines
 * are described.
 */
483 volk_32f_sin_32f_u_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
485 float* bPtr = bVector;
486 const float* aPtr = aVector;
488 unsigned int number = 0;
/* Count of complete 8-float vectors. */
489 unsigned int eighthPoints = num_points / 8;
/* (declaration continues on a line that is not visible) */
492 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
494 __m256 sine, cosine, condition1, condition2;
495 __m256i q, r, ones, twos, fours;
/* m4pi ~= 4/pi. */
497 m4pi = _mm256_set1_ps(1.273239545);
/* pio4A + pio4B ~= pi/4, split into a coarse and a fine term. */
498 pio4A = _mm256_set1_ps(0.78515625);
499 pio4B = _mm256_set1_ps(0.241876e-3);
500 ffours = _mm256_set1_ps(4.0);
501 ftwos = _mm256_set1_ps(2.0);
502 fones = _mm256_set1_ps(1.0);
503 fzeroes = _mm256_setzero_ps();
504 ones = _mm256_set1_epi32(1);
505 twos = _mm256_set1_epi32(2);
506 fours = _mm256_set1_epi32(4);
/*
 * Polynomial coefficients (1, 1/12, 1/360, 1/20160, 1/1814400 to the printed
 * precision). Only the use of cp5 and cp4 is visible in this excerpt.
 */
508 cp1 = _mm256_set1_ps(1.0);
509 cp2 = _mm256_set1_ps(0.83333333e-1);
510 cp3 = _mm256_set1_ps(0.2777778e-2);
511 cp4 = _mm256_set1_ps(0.49603e-4);
512 cp5 = _mm256_set1_ps(0.551e-6);
/* Vector loop: one unaligned 8-float load per iteration. */
514 for (; number < eighthPoints; number++) {
515 aVal = _mm256_loadu_ps(aPtr);
/* s = |aVal|: 2*aVal is subtracted only where aVal < 0. */
516 s = _mm256_sub_ps(aVal,
517 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
518 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
/* q = floor(s * m4pi) as int32; r = q + (q & 1), i.e. q made even. */
519 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
520 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
/* s = s - r*pio4A, then s = s - r*pio4B, as separate multiply and subtract. */
522 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
523 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
/* (tail of a statement whose beginning is not visible) */
527 _mm256_set1_ps(8.0));
/* s = s squared. */
528 s = _mm256_mul_ps(s, s);
/* (fragment of the polynomial evaluation: (s*cp5 - cp4) times a missing operand) */
536 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
/* Three rounds of s = s * (4 - s). */
545 for (
i = 0;
i < 3;
i++) {
546 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
/* s = s / 2. */
548 s = _mm256_div_ps(s, ftwos);
/* sine = sqrt((2 - s) * s); cosine = 1 - s. */
550 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
551 cosine = _mm256_sub_ps(fones, s);
/* condition1 starts from (q + 1) & 2; remaining compare operands are not visible. */
553 condition1 = _mm256_cmp_ps(
554 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
/*
 * condition2 is built from the masks "(q & 4) != 0" and "aVal < 0"; the outer
 * comparison predicate is not visible.
 */
557 condition2 = _mm256_cmp_ps(
559 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
560 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
/*
 * Blend: cosine where condition1 is set, sine otherwise. The left-hand side
 * of this assignment is on a line that is not visible.
 */
567 _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
/* Flip the sign in lanes where condition2 is set. */
568 sine = _mm256_sub_ps(
569 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
/* Unaligned store of the 8 results. */
570 _mm256_storeu_ps(bPtr, sine);
/* Scalar tail for the remaining elements. */
575 number = eighthPoints * 8;
576 for (; number < num_points; number++) {
/*
 * NOTE(review): double-precision sin() is used here; the SSE4.1 kernels in
 * this file use sinf() in the equivalent loop. Consider sinf() for
 * consistency.
 */
577 *bPtr++ = sin(*aPtr++);
584 #ifdef LV_HAVE_SSE4_1
585 #include <smmintrin.h>
/*
 * Unaligned SSE4.1 kernel: stores sin(aVector[i]) into bVector[i].
 *
 * bVector    - output buffer; written with the unaligned _mm_storeu_ps
 * aVector    - input buffer; read with the unaligned _mm_loadu_ps
 * num_points - number of floats to process
 *
 * NOTE(review): several lines of this function are missing from the excerpt
 * (return type, braces, first lines of some statements). Only visible lines
 * are described.
 */
588 volk_32f_sin_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
590 float* bPtr = bVector;
591 const float* aPtr = aVector;
593 unsigned int number = 0;
/* Count of complete 4-float vectors. */
594 unsigned int quarterPoints = num_points / 4;
/* (declaration continues on a line that is not visible) */
597 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
599 __m128 sine, cosine, condition1, condition2;
600 __m128i q, r, ones, twos, fours;
/* m4pi ~= 4/pi. */
602 m4pi = _mm_set1_ps(1.273239545);
/* pio4A + pio4B ~= pi/4, split into two terms subtracted one after the other. */
603 pio4A = _mm_set1_ps(0.78515625);
604 pio4B = _mm_set1_ps(0.241876e-3);
605 ffours = _mm_set1_ps(4.0);
606 ftwos = _mm_set1_ps(2.0);
607 fones = _mm_set1_ps(1.0);
608 fzeroes = _mm_setzero_ps();
609 ones = _mm_set1_epi32(1);
610 twos = _mm_set1_epi32(2);
611 fours = _mm_set1_epi32(4);
/*
 * Polynomial coefficients (1, 1/12, 1/360, 1/20160, 1/1814400 to the printed
 * precision). Only the use of cp5 and cp4 is visible in this excerpt.
 */
613 cp1 = _mm_set1_ps(1.0);
614 cp2 = _mm_set1_ps(0.83333333e-1);
615 cp3 = _mm_set1_ps(0.2777778e-2);
616 cp4 = _mm_set1_ps(0.49603e-4);
617 cp5 = _mm_set1_ps(0.551e-6);
/* Vector loop: one unaligned 4-float load per iteration. */
619 for (; number < quarterPoints; number++) {
620 aVal = _mm_loadu_ps(aPtr);
/*
 * Masked term: 2*aVal in lanes where aVal < 0, zero elsewhere. The start of
 * the statement that uses it is not visible.
 */
622 _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
/* q = floor(s * m4pi) as int32; r = q + (q & 1), i.e. q made even. */
623 q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
624 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
/* s = s - r*pio4A, then s = s - r*pio4B. */
626 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
627 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
/* (tail of a statement whose beginning is not visible) */
630 s, _mm_set1_ps(8.0));
/* s = s squared. */
631 s = _mm_mul_ps(s, s);
/* (fragment of the polynomial evaluation: (s*cp5 - cp4)*s plus a missing operand) */
638 _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
/* Three rounds of s = s * (4 - s). */
646 for (
i = 0;
i < 3;
i++) {
647 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
/* s = s / 2. */
649 s = _mm_div_ps(s, ftwos);
/* sine = sqrt((2 - s) * s); cosine = 1 - s. */
651 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
652 cosine = _mm_sub_ps(fones, s);
/* condition1: lanes where ((q + 1) & 2) is non-zero. */
654 condition1 = _mm_cmpneq_ps(
655 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
/*
 * condition2: _mm_cmpneq_ps applied to the two lane masks "(q & 4) != 0" and
 * "aVal < 0".
 * NOTE(review): an all-ones mask lane is a NaN bit pattern, so confirm the
 * compare gives the intended result when both masks are set in a lane.
 */
656 condition2 = _mm_cmpneq_ps(
657 _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
658 _mm_cmplt_ps(aVal, fzeroes));
/* Take cosine in lanes where condition1 is set, sine otherwise. */
660 sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
/*
 * sine - (2*sine & condition2): sign flip in lanes where condition2 is set.
 * The assignment target is on a line that is not visible.
 */
662 _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
/* Unaligned store of the 4 results. */
663 _mm_storeu_ps(bPtr, sine);
/* Scalar tail: single-precision sinf() for the remaining elements. */
668 number = quarterPoints * 4;
669 for (; number < num_points; number++) {
670 *bPtr++ = sinf(*aPtr++);
677 #ifdef LV_HAVE_GENERIC
/*
 * Scalar path: walks aVector and bVector in step and writes sinf() of each
 * of the num_points inputs to the output.
 * NOTE(review): the function header and closing braces are not visible in
 * this excerpt.
 */
682 float* bPtr = bVector;
683 const float* aPtr = aVector;
684 unsigned int number = 0;
/* One sinf() call per element; both pointers advance by one each iteration. */
686 for (number = 0; number < num_points; number++) {
687 *bPtr++ = sinf(*aPtr++);
695 #include <arm_neon.h>
/*
 * NEON path: processes the input four floats at a time, then finishes the
 * remainder with scalar sinf().
 * NOTE(review): the function header, the declarations of a_vec and b_vec, and
 * the code that computes b_vec from a_vec are not visible in this excerpt.
 */
701 unsigned int number = 0;
/* Count of complete 4-float vectors. */
702 unsigned int quarter_points = num_points / 4;
703 float* bVectorPtr = bVector;
704 const float* aVectorPtr = aVector;
/* Vector loop: load four floats, then store four results. */
709 for (number = 0; number < quarter_points; number++) {
710 a_vec = vld1q_f32(aVectorPtr);
/* (the statement producing b_vec is on a line that is not visible) */
714 vst1q_f32(bVectorPtr, b_vec);
/* Scalar tail for the elements left over after the full vectors. */
721 for (number = quarter_points * 4; number < num_points; number++) {
722 *bVectorPtr++ = sinf(*aVectorPtr++);