Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014, 2019 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H

#include <volk/volk_complex.h>


static inline void calculate_scaled_distances(float* target,
                                               const lv_32fc_t symbol,
                                               const lv_32fc_t* points,
                                               const float scalar,
                                               const unsigned int num_points)
{
    lv_32fc_t diff;
    for (unsigned int i = 0; i < num_points; ++i) {
        /*
         * Calculate: |y - x|^2 * SNR_lin
         * Compare C++: *target++ = scalar * std::norm(symbol - *constellation++);
         */
        diff = symbol - *points++;
        *target++ =
            scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
    }
}
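/*
 * Illustrative sketch (not part of the upstream header): scaling the squared
 * Euclidean distance from one received symbol to a small QPSK-like
 * constellation using the portable helper above. The variable names
 * (constellation, received, snr_lin, distances) are hypothetical.
 *
 *   lv_32fc_t constellation[4] = { lv_cmake(-1.f, -1.f), lv_cmake(-1.f, 1.f),
 *                                  lv_cmake(1.f, -1.f), lv_cmake(1.f, 1.f) };
 *   lv_32fc_t received = lv_cmake(0.9f, -1.1f);
 *   float snr_lin = 10.f;
 *   float distances[4];
 *   calculate_scaled_distances(distances, received, constellation, snr_lin, 4);
 *   // distances[k] now holds snr_lin * |received - constellation[k]|^2
 */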


#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target,
                                                      lv_32fc_t* src0,
                                                      lv_32fc_t* points,
                                                      float scalar,
                                                      unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;
    __m128 xmm9, xmm10;
    __m256 xmm4, xmm6;
    __m256 xmm_points0, xmm_points1, xmm_result;

    const unsigned int bound = num_bytes >> 6;
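    /*
     * Worked example (added for clarity, not in the upstream source):
     * num_points = 11 -> num_bytes = 88.
     *   bound = 88 >> 6 = 1   -> one 8-point AVX2 iteration (64 bytes)
     *   (88 >> 5) & 1 = 0     -> no 4-point tail
     *   (88 >> 4) & 1 = 1     -> one 2-point tail
     *   (88 >> 3) & 1 = 1     -> one point left for calculate_scaled_distances
     * 8 + 0 + 2 + 1 = 11 points in total.
     */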

    // load complex value into all parts of the register.
    const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
    const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);

    // Load scalar into all 8 parts of the register
    const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
    const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);

    // Set permutation constant
    const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    for (unsigned int i = 0; i < bound; ++i) {
        xmm_points0 = _mm256_load_ps((float*)points);
        xmm_points1 = _mm256_load_ps((float*)(points + 4));
        points += 8;
        __VOLK_PREFETCH(points);

        xmm_result = _mm256_scaled_norm_dist_ps_avx2(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm256_store_ps(target, xmm_result);
        target += 8;
    }

    if (num_bytes >> 5 & 1) {
        xmm_points0 = _mm256_load_ps((float*)points);

        xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);

        points += 4;

        xmm6 = _mm256_mul_ps(xmm4, xmm4);

        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);

        xmm9 = _mm256_extractf128_ps(xmm_result, 1);
        _mm_store_ps(target, xmm9);
        target += 4;
    }

    if (num_bytes >> 4 & 1) {
        xmm9 = _mm_load_ps((float*)points);

        xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);

        points += 2;

        xmm9 = _mm_mul_ps(xmm10, xmm10);

        xmm10 = _mm_hadd_ps(xmm9, xmm9);

        xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);

        _mm_storeh_pi((__m64*)target, xmm10);
        target += 2;
    }

    calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
}

#endif /*LV_HAVE_AVX2*/


#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx(float* target,
                                                     lv_32fc_t* src0,
                                                     lv_32fc_t* points,
                                                     float scalar,
                                                     unsigned int num_points)
{
    const int eightsPoints = num_points / 8;
    const int remainder = num_points - 8 * eightsPoints;

    __m256 xmm_points0, xmm_points1, xmm_result;

    // load complex value into all parts of the register.
    const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));

    // Load scalar into all 8 parts of the register
    const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);

    for (int i = 0; i < eightsPoints; ++i) {
        xmm_points0 = _mm256_load_ps((float*)points);
        xmm_points1 = _mm256_load_ps((float*)(points + 4));
        points += 8;

        xmm_result = _mm256_scaled_norm_dist_ps(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm256_store_ps(target, xmm_result);
        target += 8;
    }

    const lv_32fc_t symbol = *src0;
    calculate_scaled_distances(target, symbol, points, scalar, remainder);
}

#endif /* LV_HAVE_AVX */


#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target,
                                                      lv_32fc_t* src0,
                                                      lv_32fc_t* points,
                                                      float scalar,
                                                      unsigned int num_points)
{
    __m128 xmm_points0, xmm_points1, xmm_result;

    /*
     * First do 4 values in every loop iteration.
     * There may be up to 3 values left.
     * leftovers0 indicates if at least 2 more are available for SSE execution.
     * leftovers1 indicates if there is a single element left.
     */
    const int quarterPoints = num_points / 4;
    const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
    const int leftovers1 = num_points % 2;
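    /*
     * Worked example (added for clarity, not in the upstream source):
     * num_points = 7
     *   quarterPoints = 7 / 4 = 1      -> 4 points handled by the SSE3 loop
     *   leftovers0    = 3 - 2 * 1 = 1  -> 2 points handled by the half-width tail
     *   leftovers1    = 7 % 2 = 1      -> 1 point left for calculate_scaled_distances
     * 4 + 2 + 1 = 7 points in total.
     */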

    // load complex value into both parts of the register.
    const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));

    // Load scalar into all 4 parts of the register
    const __m128 xmm_scalar = _mm_load1_ps(&scalar);

    for (int i = 0; i < quarterPoints; ++i) {
        xmm_points0 = _mm_load_ps((float*)points);
        xmm_points1 = _mm_load_ps((float*)(points + 2));
        points += 4;
        __VOLK_PREFETCH(points);
        // calculate distances
        xmm_result = _mm_scaled_norm_dist_ps_sse3(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm_store_ps(target, xmm_result);
        target += 4;
    }

    for (int i = 0; i < leftovers0; ++i) {
        xmm_points0 = _mm_load_ps((float*)points);
        points += 2;

        xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
        xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
        xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
        xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);

        _mm_storeh_pi((__m64*)target, xmm_result);
        target += 2;
    }

    calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
}

#endif /*LV_HAVE_SSE3*/

#ifdef LV_HAVE_SSE
#include <volk/volk_sse_intrinsics.h>
#include <xmmintrin.h>
static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse(float* target,
                                                     lv_32fc_t* src0,
                                                     lv_32fc_t* points,
                                                     float scalar,
                                                     unsigned int num_points)
{
    const __m128 xmm_scalar = _mm_set1_ps(scalar);
    const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));

    for (unsigned i = 0; i < num_points / 4; ++i) {
        __m128 xmm_points0 = _mm_load_ps((float*)points);
        __m128 xmm_points1 = _mm_load_ps((float*)(points + 2));
        points += 4;
        __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
        _mm_store_ps((float*)target, xmm_result);
        target += 4;
    }

    calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
}
#endif // LV_HAVE_SSE

#ifdef LV_HAVE_GENERIC
static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target,
                                                       lv_32fc_t* src0,
                                                       lv_32fc_t* points,
                                                       float scalar,
                                                       unsigned int num_points)
{
    const lv_32fc_t symbol = *src0;
    calculate_scaled_distances(target, symbol, points, scalar, num_points);
}

#endif /*LV_HAVE_GENERIC*/


#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H*/

#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H

#include <volk/volk_complex.h>


#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target,
                                                      lv_32fc_t* src0,
                                                      lv_32fc_t* points,
                                                      float scalar,
                                                      unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;
    __m128 xmm9, xmm10;
    __m256 xmm4, xmm6;
    __m256 xmm_points0, xmm_points1, xmm_result;

    const unsigned int bound = num_bytes >> 6;

    // load complex value into all parts of the register.
    const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
    const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);

    // Load scalar into all 8 parts of the register
    const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
    const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);

    // Set permutation constant
    const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    for (unsigned int i = 0; i < bound; ++i) {
        xmm_points0 = _mm256_loadu_ps((float*)points);
        xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
        points += 8;
        __VOLK_PREFETCH(points);

        xmm_result = _mm256_scaled_norm_dist_ps_avx2(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm256_storeu_ps(target, xmm_result);
        target += 8;
    }

    if (num_bytes >> 5 & 1) {
        xmm_points0 = _mm256_loadu_ps((float*)points);

        xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);

        points += 4;

        xmm6 = _mm256_mul_ps(xmm4, xmm4);

        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);

        xmm9 = _mm256_extractf128_ps(xmm_result, 1);
        _mm_storeu_ps(target, xmm9);
        target += 4;
    }

    if (num_bytes >> 4 & 1) {
        xmm9 = _mm_loadu_ps((float*)points);

        xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);

        points += 2;

        xmm9 = _mm_mul_ps(xmm10, xmm10);

        xmm10 = _mm_hadd_ps(xmm9, xmm9);

        xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);

        _mm_storeh_pi((__m64*)target, xmm10);
        target += 2;
    }

    calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
}

#endif /*LV_HAVE_AVX2*/


#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx(float* target,
                                                     lv_32fc_t* src0,
                                                     lv_32fc_t* points,
                                                     float scalar,
                                                     unsigned int num_points)
{
    const int eightsPoints = num_points / 8;
    const int remainder = num_points - 8 * eightsPoints;

    __m256 xmm_points0, xmm_points1, xmm_result;

    // load complex value into all parts of the register.
    const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));

    // Load scalar into all 8 parts of the register
    const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);

    for (int i = 0; i < eightsPoints; ++i) {
        xmm_points0 = _mm256_loadu_ps((float*)points);
        xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
        points += 8;

        xmm_result = _mm256_scaled_norm_dist_ps(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm256_storeu_ps(target, xmm_result);
        target += 8;
    }

    const lv_32fc_t symbol = *src0;
    calculate_scaled_distances(target, symbol, points, scalar, remainder);
}

#endif /* LV_HAVE_AVX */


#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse3(float* target,
                                                      lv_32fc_t* src0,
                                                      lv_32fc_t* points,
                                                      float scalar,
                                                      unsigned int num_points)
{
    __m128 xmm_points0, xmm_points1, xmm_result;

    /*
     * First do 4 values in every loop iteration.
     * There may be up to 3 values left.
     * leftovers0 indicates if at least 2 more are available for SSE execution.
     * leftovers1 indicates if there is a single element left.
     */
    const int quarterPoints = num_points / 4;
    const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
    const int leftovers1 = num_points % 2;

    // load complex value into both parts of the register.
    const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));

    // Load scalar into all 4 parts of the register
    const __m128 xmm_scalar = _mm_load1_ps(&scalar);

    for (int i = 0; i < quarterPoints; ++i) {
        xmm_points0 = _mm_loadu_ps((float*)points);
        xmm_points1 = _mm_loadu_ps((float*)(points + 2));
        points += 4;
        __VOLK_PREFETCH(points);
        // calculate distances
        xmm_result = _mm_scaled_norm_dist_ps_sse3(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm_storeu_ps(target, xmm_result);
        target += 4;
    }

    for (int i = 0; i < leftovers0; ++i) {
        xmm_points0 = _mm_loadu_ps((float*)points);
        points += 2;

        xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
        xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
        xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
        xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);

        _mm_storeh_pi((__m64*)target, xmm_result);
        target += 2;
    }

    calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
}

#endif /*LV_HAVE_SSE3*/

#ifdef LV_HAVE_SSE
#include <volk/volk_sse_intrinsics.h>
#include <xmmintrin.h>
static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse(float* target,
                                                     lv_32fc_t* src0,
                                                     lv_32fc_t* points,
                                                     float scalar,
                                                     unsigned int num_points)
{
    const __m128 xmm_scalar = _mm_set1_ps(scalar);
    const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));

    for (unsigned i = 0; i < num_points / 4; ++i) {
        __m128 xmm_points0 = _mm_loadu_ps((float*)points);
        __m128 xmm_points1 = _mm_loadu_ps((float*)(points + 2));
        points += 4;
        __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
        _mm_storeu_ps((float*)target, xmm_result);
        target += 4;
    }

    calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
}
#endif // LV_HAVE_SSE

#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H*/
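
/*
 * Usage sketch (added for illustration, not part of the upstream header):
 * applications normally call the VOLK dispatcher rather than an
 * architecture-suffixed variant directly. The buffer names below are
 * hypothetical; volk_malloc, volk_free and volk_get_alignment come from
 * <volk/volk.h> and keep the buffers aligned for the _a_ kernels.
 *
 *   #include <volk/volk.h>
 *
 *   unsigned int N = 64;
 *   size_t alignment = volk_get_alignment();
 *   lv_32fc_t* points = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);
 *   float* target = (float*)volk_malloc(N * sizeof(float), alignment);
 *   lv_32fc_t symbol = lv_cmake(0.5f, -0.5f);
 *   float scalar = 2.0f; // e.g. linear SNR
 *   // ... fill points with constellation values ...
 *   volk_32fc_x2_s32f_square_dist_scalar_mult_32f(target, &symbol, points, scalar, N);
 *   volk_free(points);
 *   volk_free(target);
 */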