Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32fc_index_min_32u.h
/* -*- c++ -*- */
/*
 * Copyright 2021 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * VOLK is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * VOLK is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

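/*!
 * \page volk_32fc_index_min_32u
 *
 * \b Overview
 *
 * Computes the index of the complex input sample with the smallest magnitude
 * (equivalently, the smallest magnitude squared, since sqrt() is monotonic)
 * and stores it in \p target.
 *
 * A minimal usage sketch, assuming the generated VOLK dispatcher
 * volk_32fc_index_min_32u() and the volk_malloc()/volk_free() allocation
 * helpers; the test data below is illustrative only:
 *
 * \code
 * #include <stdio.h>
 * #include <volk/volk.h>
 *
 * int main(void)
 * {
 *     const uint32_t N = 16;
 *     lv_32fc_t* in = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), volk_get_alignment());
 *     uint32_t* min_idx = (uint32_t*)volk_malloc(sizeof(uint32_t), volk_get_alignment());
 *
 *     for (uint32_t i = 0; i < N; ++i) {
 *         in[i] = lv_cmake((float)(i + 1), 0.f); // magnitudes 1..N, minimum at index 0
 *     }
 *
 *     volk_32fc_index_min_32u(min_idx, in, N);
 *     printf("index of minimum-magnitude sample: %u\n", *min_idx); // expect 0
 *
 *     volk_free(in);
 *     volk_free(min_idx);
 *     return 0;
 * }
 * \endcode
 */
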
#ifndef INCLUDED_volk_32fc_index_min_32u_a_H
#define INCLUDED_volk_32fc_index_min_32u_a_H

#include <float.h>
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_min_32u_a_avx2_variant_0(uint32_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. An explanation of the odd order
     * is given in the implementation of vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)source);
        __m256 in1 = _mm256_load_ps((float*)(source + 4));
        vector_32fc_index_min_variant0(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

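/*
 * Scalar model of the reduction performed by the AVX2 variants above (an
 * illustrative sketch; the actual lane arithmetic lives in
 * vector_32fc_index_min_variant0/1() in volk_avx2_intrinsics.h). Horizontal
 * adds such as _mm256_hadd_ps() operate within each 128-bit lane, so the
 * eight magnitude-squared results presumably emerge in element order
 * 0, 1, 4, 5, 2, 3, 6, 7, which matches the order encoded by
 * _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0) above. scalar_min_step() is a
 * hypothetical name for one loop iteration expressed in plain C; min_values
 * and min_indices are assumed to start at FLT_MAX and 0 respectively:
 *
 * \code
 * static void scalar_min_step(const float* f, // 8 complex points = 16 floats
 *                             uint32_t base_index,
 *                             float min_values[8],
 *                             uint32_t min_indices[8])
 * {
 *     static const uint32_t lane_order[8] = { 0, 1, 4, 5, 2, 3, 6, 7 };
 *     for (int lane = 0; lane < 8; lane++) {
 *         const uint32_t k = lane_order[lane];
 *         const float mag2 = f[2 * k] * f[2 * k] + f[2 * k + 1] * f[2 * k + 1];
 *         if (mag2 < min_values[lane]) {
 *             min_values[lane] = mag2;
 *             min_indices[lane] = base_index + k;
 *         }
 *     }
 * }
 * \endcode
 */
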
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_min_32u_a_avx2_variant_1(uint32_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. An explanation of the odd order
     * is given in the implementation of vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)source);
        __m256 in1 = _mm256_load_ps((float*)(source + 4));
        vector_32fc_index_min_variant1(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <xmmintrin.h>

static inline void volk_32fc_index_min_32u_a_sse3(uint32_t* target,
                                                  const lv_32fc_t* source,
                                                  uint32_t num_points)
{
    union bit128 holderf;
    union bit128 holderi;
    float sq_dist = 0.0;

    union bit128 xmm5, xmm4;
    __m128 xmm1, xmm2, xmm3;
    __m128i xmm8, xmm11, xmm12, xmm9, xmm10;

    xmm5.int_vec = _mm_setzero_si128();
    xmm4.int_vec = _mm_setzero_si128();
    holderf.int_vec = _mm_setzero_si128();
    holderi.int_vec = _mm_setzero_si128();

    xmm8 = _mm_setr_epi32(0, 1, 2, 3);
    xmm9 = _mm_setzero_si128();
    xmm10 = _mm_setr_epi32(4, 4, 4, 4);
    xmm3 = _mm_set_ps1(FLT_MAX);

    int bound = num_points >> 2;

    for (int i = 0; i < bound; ++i) {
        xmm1 = _mm_load_ps((float*)source);
        xmm2 = _mm_load_ps((float*)&source[2]);

        source += 4;

        xmm1 = _mm_mul_ps(xmm1, xmm1);
        xmm2 = _mm_mul_ps(xmm2, xmm2);

        xmm1 = _mm_hadd_ps(xmm1, xmm2);

        xmm3 = _mm_min_ps(xmm1, xmm3);

        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    if (num_points >> 1 & 1) {
        xmm2 = _mm_load_ps((float*)source);

        xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
        xmm8 = bit128_p(&xmm1)->int_vec;

        xmm2 = _mm_mul_ps(xmm2, xmm2);

        source += 2;

        xmm1 = _mm_hadd_ps(xmm2, xmm2);

        xmm3 = _mm_min_ps(xmm1, xmm3);

        xmm10 = _mm_setr_epi32(2, 2, 2, 2);

        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    if (num_points & 1) {
        sq_dist = lv_creal(source[0]) * lv_creal(source[0]) +
                  lv_cimag(source[0]) * lv_cimag(source[0]);

        xmm2 = _mm_load1_ps(&sq_dist);

        xmm1 = xmm3;

        xmm3 = _mm_min_ss(xmm3, xmm2);

        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm8 = _mm_shuffle_epi32(xmm8, 0x00);

        xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);
    }

    _mm_store_ps((float*)&(holderf.f), xmm3);
    _mm_store_si128(&(holderi.int_vec), xmm9);

    target[0] = holderi.i[0];
    sq_dist = holderf.f[0];
    target[0] = (holderf.f[1] < sq_dist) ? holderi.i[1] : target[0];
    sq_dist = (holderf.f[1] < sq_dist) ? holderf.f[1] : sq_dist;
    target[0] = (holderf.f[2] < sq_dist) ? holderi.i[2] : target[0];
    sq_dist = (holderf.f[2] < sq_dist) ? holderf.f[2] : sq_dist;
    target[0] = (holderf.f[3] < sq_dist) ? holderi.i[3] : target[0];
    sq_dist = (holderf.f[3] < sq_dist) ? holderf.f[3] : sq_dist;
}

#endif /*LV_HAVE_SSE3*/

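/*
 * The SSE3 loop above computes four magnitude-squared values per iteration
 * via a pairwise horizontal add; a sketch of that step in isolation
 * (illustration only, not VOLK API):
 *
 * \code
 * __m128 a = _mm_load_ps((float*)source);     // re0 im0 re1 im1
 * __m128 b = _mm_load_ps((float*)&source[2]); // re2 im2 re3 im3
 * a = _mm_mul_ps(a, a);                       // squared components
 * b = _mm_mul_ps(b, b);
 * __m128 mag2 = _mm_hadd_ps(a, b);            // |z0|^2 |z1|^2 |z2|^2 |z3|^2
 * \endcode
 *
 * The cmpeq/cmpgt masks then select, element by element, either the index of
 * a new minimum (from xmm8) or the previously stored index (from xmm9).
 */
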
#ifdef LV_HAVE_GENERIC
static inline void volk_32fc_index_min_32u_generic(uint32_t* target,
                                                   const lv_32fc_t* source,
                                                   uint32_t num_points)
{
    float sq_dist = 0.0;
    float min = FLT_MAX;
    uint32_t index = 0;

    for (uint32_t i = 0; i < num_points; ++i) {
        sq_dist = lv_creal(source[i]) * lv_creal(source[i]) +
                  lv_cimag(source[i]) * lv_cimag(source[i]);

        if (sq_dist < min) {
            index = i;
            min = sq_dist;
        }
    }
    target[0] = index;
}

#endif /*LV_HAVE_GENERIC*/

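/*
 * A quick self-check of the generic kernel (a hypothetical test program, not
 * part of VOLK; compile with -DLV_HAVE_GENERIC and this header on the include
 * path). Comparing magnitude squared rather than magnitude is sufficient
 * because sqrt() is monotonic: argmin |z[i]|^2 == argmin |z[i]|.
 *
 * \code
 * #include <stdio.h>
 * #include <volk/volk_32fc_index_min_32u.h>
 *
 * int main(void)
 * {
 *     const lv_32fc_t buf[4] = { lv_cmake(3.f, 4.f),   // |z|^2 = 25
 *                                lv_cmake(0.f, 1.f),   // |z|^2 = 1
 *                                lv_cmake(2.f, 0.f),   // |z|^2 = 4
 *                                lv_cmake(5.f, 5.f) }; // |z|^2 = 50
 *     uint32_t idx = 0;
 *     volk_32fc_index_min_32u_generic(&idx, buf, 4);
 *     printf("%u\n", idx); // prints 1
 *     return 0;
 * }
 * \endcode
 */
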
#endif /*INCLUDED_volk_32fc_index_min_32u_a_H*/

#ifndef INCLUDED_volk_32fc_index_min_32u_u_H
#define INCLUDED_volk_32fc_index_min_32u_u_H

#include <float.h>
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_min_32u_u_avx2_variant_0(uint32_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. An explanation of the odd order
     * is given in the implementation of vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)source);
        __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
        vector_32fc_index_min_variant0(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_min_32u_u_avx2_variant_1(uint32_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. An explanation of the odd order
     * is given in the implementation of vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)source);
        __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
        vector_32fc_index_min_variant1(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_NEON
#include <arm_neon.h>
#include <volk/volk_neon_intrinsics.h>

static inline void volk_32fc_index_min_32u_neon(uint32_t* target,
                                                const lv_32fc_t* source,
                                                uint32_t num_points)
{
    const uint32_t quarter_points = num_points / 4;
    const lv_32fc_t* sourcePtr = source;

    uint32_t indices[4] = { 0, 1, 2, 3 };
    const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
    uint32x4_t vec_indices = vld1q_u32(indices);
    uint32x4_t vec_min_indices = vec_indices;

    if (num_points) {
        float min = FLT_MAX;
        uint32_t index = 0;

        float32x4_t vec_min = vdupq_n_f32(FLT_MAX);

        for (uint32_t number = 0; number < quarter_points; number++) {
            // Load complex and compute magnitude squared
            const float32x4_t vec_mag2 =
                _vmagnitudesquaredq_f32(vld2q_f32((float*)sourcePtr));
            __VOLK_PREFETCH(sourcePtr += 4);
            // a < b?
            const uint32x4_t lt_mask = vcltq_f32(vec_mag2, vec_min);
            vec_min = vbslq_f32(lt_mask, vec_mag2, vec_min);
            vec_min_indices = vbslq_u32(lt_mask, vec_indices, vec_min_indices);
            vec_indices = vaddq_u32(vec_indices, vec_indices_incr);
        }
        uint32_t tmp_min_indices[4];
        float tmp_min[4];
        vst1q_u32(tmp_min_indices, vec_min_indices);
        vst1q_f32(tmp_min, vec_min);

        for (int i = 0; i < 4; i++) {
            if (tmp_min[i] < min) {
                min = tmp_min[i];
                index = tmp_min_indices[i];
            }
        }

        // Deal with the rest
        for (uint32_t number = quarter_points * 4; number < num_points; number++) {
            const float re = lv_creal(*sourcePtr);
            const float im = lv_cimag(*sourcePtr);
            const float sq_dist = re * re + im * im;
            if (sq_dist < min) {
                min = sq_dist;
                index = number;
            }
            sourcePtr++;
        }
        *target = index;
    }
}

#endif /*LV_HAVE_NEON*/

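/*
 * A sketch of what _vmagnitudesquaredq_f32() computes (see
 * volk_neon_intrinsics.h for the actual definition). vld2q_f32()
 * de-interleaves four complex samples into val[0] = real parts and
 * val[1] = imaginary parts, so the magnitude squared reduces to a
 * multiply-accumulate across the two registers:
 *
 * \code
 * static inline float32x4_t mag2_sketch(float32x4x2_t c) // hypothetical name
 * {
 *     // c.val[0] * c.val[0] + c.val[1] * c.val[1]
 *     return vmlaq_f32(vmulq_f32(c.val[0], c.val[0]), c.val[1], c.val[1]);
 * }
 * \endcode
 */
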
#endif /*INCLUDED_volk_32fc_index_min_32u_u_H*/