Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32fc_x2_multiply_32fc.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H
#define INCLUDED_volk_32fc_x2_multiply_32fc_u_H

#include <float.h>
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_x2_multiply_32fc_u_avx2_fma(lv_32fc_t* cVector,
                                                         const lv_32fc_t* aVector,
                                                         const lv_32fc_t* bVector,
                                                         unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;

    for (; number < quarterPoints; number++) {

        const __m256 x =
            _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
        const __m256 y =
            _mm256_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di

        const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
        const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di

        const __m256 tmp2x = _mm256_permute_ps(x, 0xB1); // Re-arrange x to be ai,ar,bi,br

        const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

        const __m256 z = _mm256_fmaddsub_ps(
            x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        _mm256_storeu_ps((float*)c, z); // Store the results back into the C container

        a += 4;
        b += 4;
        c += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *c++ = (*a++) * (*b++);
    }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
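
/*
 * Note on the shuffle/FMA sequence above: for each complex element the
 * kernel evaluates
 *
 *   (ar + j*ai) * (cr + j*ci) = (ar*cr - ai*ci) + j*(ar*ci + ai*cr)
 *
 * _mm256_moveldup_ps duplicates the real parts of y (cr,cr,dr,dr,...),
 * _mm256_movehdup_ps duplicates the imaginary parts (ci,ci,di,di,...),
 * _mm256_permute_ps(x, 0xB1) swaps each (ar,ai) pair to (ai,ar), and
 * _mm256_fmaddsub_ps(x, yl, tmp2) subtracts tmp2 in the even (real) lanes
 * and adds it in the odd (imaginary) lanes, so both components of every
 * product come out of a single fused instruction.
 */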


#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector,
                                                    const lv_32fc_t* aVector,
                                                    const lv_32fc_t* bVector,
                                                    unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, z;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;

    for (; number < quarterPoints; number++) {
        x = _mm256_loadu_ps(
            (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
        y = _mm256_loadu_ps(
            (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
        z = _mm256_complexmul_ps(x, y);
        _mm256_storeu_ps((float*)c, z); // Store the results back into the C container

        a += 4;
        b += 4;
        c += 4;
    }

    number = quarterPoints * 4;

    for (; number < num_points; number++) {
        *c++ = (*a++) * (*b++);
    }
}
#endif /* LV_HAVE_AVX */


#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector,
                                                     const lv_32fc_t* aVector,
                                                     const lv_32fc_t* bVector,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, y, z;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;

    for (; number < halfPoints; number++) {
        x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
        y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
        z = _mm_complexmul_ps(x, y);
        _mm_storeu_ps((float*)c, z); // Store the results back into the C container

        a += 2;
        b += 2;
        c += 2;
    }

    if ((num_points % 2) != 0) {
        *c = (*a) * (*b);
    }
}
#endif /* LV_HAVE_SSE3 */


#ifdef LV_HAVE_GENERIC

static inline void volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector,
                                                      const lv_32fc_t* aVector,
                                                      const lv_32fc_t* bVector,
                                                      unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */


#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */
#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a_H
#define INCLUDED_volk_32fc_x2_multiply_32fc_a_H

#include <float.h>
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_x2_multiply_32fc_a_avx2_fma(lv_32fc_t* cVector,
                                                         const lv_32fc_t* aVector,
                                                         const lv_32fc_t* bVector,
                                                         unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;

    for (; number < quarterPoints; number++) {

        const __m256 x =
            _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
        const __m256 y =
            _mm256_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di

        const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
        const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di

        const __m256 tmp2x = _mm256_permute_ps(x, 0xB1); // Re-arrange x to be ai,ar,bi,br

        const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

        const __m256 z = _mm256_fmaddsub_ps(
            x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        _mm256_store_ps((float*)c, z); // Store the results back into the C container

        a += 4;
        b += 4;
        c += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *c++ = (*a++) * (*b++);
    }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */


#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector,
                                                    const lv_32fc_t* aVector,
                                                    const lv_32fc_t* bVector,
                                                    unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, z;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;

    for (; number < quarterPoints; number++) {
        x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
        y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
        z = _mm256_complexmul_ps(x, y);
        _mm256_store_ps((float*)c, z); // Store the results back into the C container

        a += 4;
        b += 4;
        c += 4;
    }

    number = quarterPoints * 4;

    for (; number < num_points; number++) {
        *c++ = (*a++) * (*b++);
    }
}
#endif /* LV_HAVE_AVX */

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

static inline void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector,
                                                     const lv_32fc_t* aVector,
                                                     const lv_32fc_t* bVector,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, y, z;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;

    for (; number < halfPoints; number++) {
        x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
        y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
        z = _mm_complexmul_ps(x, y);
        _mm_store_ps((float*)c, z); // Store the results back into the C container

        a += 2;
        b += 2;
        c += 2;
    }

    if ((num_points % 2) != 0) {
        *c = (*a) * (*b);
    }
}
#endif /* LV_HAVE_SSE3 */


#ifdef LV_HAVE_GENERIC

static inline void volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector,
                                                        const lv_32fc_t* aVector,
                                                        const lv_32fc_t* bVector,
                                                        unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector,
                                                   const lv_32fc_t* aVector,
                                                   const lv_32fc_t* bVector,
                                                   unsigned int num_points)
{
    lv_32fc_t* a_ptr = (lv_32fc_t*)aVector;
    lv_32fc_t* b_ptr = (lv_32fc_t*)bVector;
    unsigned int quarter_points = num_points / 4;
    float32x4x2_t a_val, b_val, c_val;
    float32x4x2_t tmp_real, tmp_imag;
    unsigned int number = 0;

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 4);
        __VOLK_PREFETCH(b_ptr + 4);

        // multiply the real*real and imag*imag to get real result
        // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
        tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
        // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
        tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);

        // Multiply cross terms to get the imaginary result
        // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
        // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);

        // store the results
        c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
        c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
        vst2q_f32((float*)cVector, c_val);

        a_ptr += 4;
        b_ptr += 4;
        cVector += 4;
    }

    for (number = quarter_points * 4; number < num_points; number++) {
        *cVector++ = (*a_ptr++) * (*b_ptr++);
    }
}
#endif /* LV_HAVE_NEON */
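
/*
 * Note on the NEON variant above: vld2q_f32/vst2q_f32 de-interleave the
 * complex samples, so val[0] carries four real parts and val[1] four
 * imaginary parts. The kernel then forms
 *
 *   real: ar*br - ai*bi   (vmulq_f32 + vsubq_f32)
 *   imag: ar*bi + ai*br   (vmulq_f32 + vaddq_f32)
 *
 * for four complex products per iteration. The variant below computes the
 * same thing but folds the add/subtract into vmlaq_f32/vmlsq_f32
 * multiply-accumulate instructions.
 */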


#ifdef LV_HAVE_NEON

static inline void volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector,
                                                            const lv_32fc_t* aVector,
                                                            const lv_32fc_t* bVector,
                                                            unsigned int num_points)
{
    lv_32fc_t* a_ptr = (lv_32fc_t*)aVector;
    lv_32fc_t* b_ptr = (lv_32fc_t*)bVector;
    unsigned int quarter_points = num_points / 4;
    float32x4x2_t a_val, b_val;
    float32x4x2_t tmp_imag;
    unsigned int number = 0;

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 4);
        __VOLK_PREFETCH(b_ptr + 4);

        // do the first multiply
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);

        // use multiply accumulate/subtract to get result
        tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
        tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);

        // store
        vst2q_f32((float*)cVector, tmp_imag);
        // increment pointers
        a_ptr += 4;
        b_ptr += 4;
        cVector += 4;
    }

    for (number = quarter_points * 4; number < num_points; number++) {
        *cVector++ = (*a_ptr++) * (*b_ptr++);
    }
}
#endif /* LV_HAVE_NEON */


#ifdef LV_HAVE_NEONV7

extern void volk_32fc_x2_multiply_32fc_a_neonasm(lv_32fc_t* cVector,
                                                 const lv_32fc_t* aVector,
                                                 const lv_32fc_t* bVector,
                                                 unsigned int num_points);
#endif /* LV_HAVE_NEONV7 */


#ifdef LV_HAVE_ORC

extern void volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector,
                                                  const lv_32fc_t* aVector,
                                                  const lv_32fc_t* bVector,
                                                  unsigned int num_points);

static inline void volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector,
                                                    const lv_32fc_t* aVector,
                                                    const lv_32fc_t* bVector,
                                                    unsigned int num_points)
{
    volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
}

#endif /* LV_HAVE_ORC */

#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a_H */
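
For reference, a minimal usage sketch of this kernel. It assumes the generated VOLK dispatcher volk_32fc_x2_multiply_32fc() and the runtime helpers volk_get_alignment(), volk_malloc() and volk_free() from <volk/volk.h>, which are provided by the VOLK library rather than by this header; allocating aligned buffers lets the dispatcher select one of the *_a_* kernels above.

#include <volk/volk.h>
#include <stdio.h>

int main(void)
{
    const unsigned int N = 1000;
    const size_t alignment = volk_get_alignment();

    /* Aligned allocations so the aligned (_a_) kernels are eligible. */
    lv_32fc_t* in0 = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
    lv_32fc_t* in1 = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
    lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);

    for (unsigned int i = 0; i < N; i++) {
        in0[i] = lv_cmake((float)i, (float)-i); // i - i*j
        in1[i] = lv_cmake(0.5f, 2.0f);          // constant multiplier
    }

    /* Dispatches to the fastest kernel available on the running CPU. */
    volk_32fc_x2_multiply_32fc(out, in0, in1, N);

    printf("out[1] = %f + %fj\n", lv_creal(out[1]), lv_cimag(out[1]));

    volk_free(in0);
    volk_free(in1);
    volk_free(out);
    return 0;
}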