Vector Optimized Library of Kernels  2.5.1
Architecture-tuned implementations of math kernels
volk_32fc_index_min_16u.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2021 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * VOLK is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * VOLK is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
76 #ifndef INCLUDED_volk_32fc_index_min_16u_a_H
77 #define INCLUDED_volk_32fc_index_min_16u_a_H
78 
#include <float.h>
#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>
84 
85 #ifdef LV_HAVE_AVX2
86 #include <immintrin.h>
88 
89 static inline void volk_32fc_index_min_16u_a_avx2_variant_0(uint16_t* target,
90  const lv_32fc_t* source,
91  uint32_t num_points)
92 {
93  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
94 
95  const __m256i indices_increment = _mm256_set1_epi32(8);
96  /*
97  * At the start of each loop iteration current_indices holds the indices of
98  * the complex numbers loaded from memory. Explanation for odd order is given
99  * in implementation of vector_32fc_index_min_variant0().
100  */
101  __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
102 
103  __m256 min_values = _mm256_set1_ps(FLT_MAX);
104  __m256i min_indices = _mm256_setzero_si256();
105 
106  for (unsigned i = 0; i < num_points / 8u; ++i) {
107  __m256 in0 = _mm256_load_ps((float*)source);
108  __m256 in1 = _mm256_load_ps((float*)(source + 4));
110  in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
111  source += 8;
112  }
113 
114  // determine minimum value and index in the result of the vectorized loop
115  __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
116  __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
117  _mm256_store_ps(min_values_buffer, min_values);
118  _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
119 
120  float min = FLT_MAX;
121  uint32_t index = 0;
122  for (unsigned i = 0; i < 8; i++) {
123  if (min_values_buffer[i] < min) {
124  min = min_values_buffer[i];
125  index = min_indices_buffer[i];
126  }
127  }
128 
129  // handle tail not processed by the vectorized loop
130  for (unsigned i = num_points & (~7u); i < num_points; ++i) {
131  const float abs_squared =
132  lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
133  if (abs_squared < min) {
134  min = abs_squared;
135  index = i;
136  }
137  ++source;
138  }
139 
140  *target = index;
141 }
142 
143 #endif /*LV_HAVE_AVX2*/
144 
145 #ifdef LV_HAVE_AVX2
146 #include <immintrin.h>
148 
149 static inline void volk_32fc_index_min_16u_a_avx2_variant_1(uint16_t* target,
150  const lv_32fc_t* source,
151  uint32_t num_points)
152 {
153  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
154 
155  const __m256i indices_increment = _mm256_set1_epi32(8);
156  /*
157  * At the start of each loop iteration current_indices holds the indices of
158  * the complex numbers loaded from memory. Explanation for odd order is given
159  * in implementation of vector_32fc_index_min_variant0().
160  */
161  __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
162 
163  __m256 min_values = _mm256_set1_ps(FLT_MAX);
164  __m256i min_indices = _mm256_setzero_si256();
165 
166  for (unsigned i = 0; i < num_points / 8u; ++i) {
167  __m256 in0 = _mm256_load_ps((float*)source);
168  __m256 in1 = _mm256_load_ps((float*)(source + 4));
170  in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
171  source += 8;
172  }
173 
174  // determine minimum value and index in the result of the vectorized loop
175  __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
176  __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
177  _mm256_store_ps(min_values_buffer, min_values);
178  _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
179 
180  float min = FLT_MAX;
181  uint32_t index = 0;
182  for (unsigned i = 0; i < 8; i++) {
183  if (min_values_buffer[i] < min) {
184  min = min_values_buffer[i];
185  index = min_indices_buffer[i];
186  }
187  }
188 
189  // handle tail not processed by the vectorized loop
190  for (unsigned i = num_points & (~7u); i < num_points; ++i) {
191  const float abs_squared =
192  lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
193  if (abs_squared < min) {
194  min = abs_squared;
195  index = i;
196  }
197  ++source;
198  }
199 
200  *target = index;
201 }
202 
203 #endif /*LV_HAVE_AVX2*/
204 
205 #ifdef LV_HAVE_SSE3
206 #include <pmmintrin.h>
207 #include <xmmintrin.h>
208 
/*
 * SSE3 implementation: writes to *target the index of the input sample with
 * the smallest squared magnitude |re|^2 + |im|^2. Processes four complex
 * values per main-loop iteration, then a possible leftover pair, then a
 * possible final single value, and finally reduces the four SIMD lanes
 * with scalar compares. 'source' must be 16-byte aligned.
 */
static inline void volk_32fc_index_min_16u_a_sse3(uint16_t* target,
                                                  const lv_32fc_t* source,
                                                  uint32_t num_points)
{
    /* The returned index must fit into a uint16_t, so never search further. */
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    /* bit128 (volk_common.h) unions give both float and integer views of a
     * 128-bit register, used to apply integer masks to compare results. */
    union bit128 holderf;
    union bit128 holderi;
    float sq_dist = 0.0;

    union bit128 xmm5, xmm4;
    __m128 xmm1, xmm2, xmm3;
    __m128i xmm8, xmm11, xmm12, xmm9, xmm10;

    xmm5.int_vec = _mm_setzero_si128();
    xmm4.int_vec = _mm_setzero_si128();
    holderf.int_vec = _mm_setzero_si128();
    holderi.int_vec = _mm_setzero_si128();

    /* xmm8: indices of the 4 candidates currently loaded;
     * xmm9: per-lane index of the running minimum;
     * xmm10: per-iteration index increment (4 candidates per pass);
     * xmm3: per-lane running minimum values, seeded with FLT_MAX. */
    xmm8 = _mm_setr_epi32(0, 1, 2, 3);
    xmm9 = _mm_setzero_si128();
    xmm10 = _mm_setr_epi32(4, 4, 4, 4);
    xmm3 = _mm_set_ps1(FLT_MAX);

    int bound = num_points >> 2;

    for (int i = 0; i < bound; ++i) {
        /* Load 4 complex values (8 floats) and square each component. */
        xmm1 = _mm_load_ps((float*)source);
        xmm2 = _mm_load_ps((float*)&source[2]);

        source += 4;

        xmm1 = _mm_mul_ps(xmm1, xmm1);
        xmm2 = _mm_mul_ps(xmm2, xmm2);

        /* Horizontal add pairs re^2 + im^2, giving the 4 squared magnitudes. */
        xmm1 = _mm_hadd_ps(xmm1, xmm2);

        /* Update the per-lane minimum. */
        xmm3 = _mm_min_ps(xmm1, xmm3);

        /* After the min: cmpgt is true where the candidate lost (keep the old
         * index), cmpeq is true where the candidate is the new minimum (take
         * the new index). The masks select between xmm8 and xmm9 below. */
        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        /* Advance the candidate indices by 4 for the next iteration. */
        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    /* Leftover pair: bit 1 of num_points set means two more complex values
     * remain after the 4-at-a-time loop. */
    if (num_points >> 1 & 1) {
        xmm2 = _mm_load_ps((float*)source);

        /* Duplicate the low two candidate indices into the high lanes so they
         * line up with the duplicated hadd results below. */
        xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
        xmm8 = bit128_p(&xmm1)->int_vec;

        xmm2 = _mm_mul_ps(xmm2, xmm2);

        source += 2;

        xmm1 = _mm_hadd_ps(xmm2, xmm2);

        xmm3 = _mm_min_ps(xmm1, xmm3);

        /* Only 2 values were consumed, so the index step drops to 2. */
        xmm10 = _mm_setr_epi32(2, 2, 2, 2);

        /* Same select-by-mask update as in the main loop. */
        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    /* Final single value: compute its squared magnitude in scalar code and
     * fold it into lane 0 only (via min_ss). */
    if (num_points & 1) {
        sq_dist = lv_creal(source[0]) * lv_creal(source[0]) +
                  lv_cimag(source[0]) * lv_cimag(source[0]);

        xmm2 = _mm_load1_ps(&sq_dist);

        xmm1 = xmm3;

        xmm3 = _mm_min_ss(xmm3, xmm2);

        /* Here xmm1 holds the OLD minima, so the mask roles are swapped
         * relative to the main loop: cmpgt(old, new) is true where the new
         * scalar won (take its index), cmpeq where the old minimum stands. */
        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        /* Broadcast the lane-0 candidate index to all lanes. */
        xmm8 = _mm_shuffle_epi32(xmm8, 0x00);

        xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);
    }

    /* Spill the 4 lane minima and their indices, then reduce them serially. */
    _mm_store_ps((float*)&(holderf.f), xmm3);
    _mm_store_si128(&(holderi.int_vec), xmm9);

    target[0] = holderi.i[0];
    sq_dist = holderf.f[0];
    target[0] = (holderf.f[1] < sq_dist) ? holderi.i[1] : target[0];
    sq_dist = (holderf.f[1] < sq_dist) ? holderf.f[1] : sq_dist;
    target[0] = (holderf.f[2] < sq_dist) ? holderi.i[2] : target[0];
    sq_dist = (holderf.f[2] < sq_dist) ? holderf.f[2] : sq_dist;
    target[0] = (holderf.f[3] < sq_dist) ? holderi.i[3] : target[0];
    sq_dist = (holderf.f[3] < sq_dist) ? holderf.f[3] : sq_dist;
}
319 
320 #endif /*LV_HAVE_SSE3*/
321 
322 #ifdef LV_HAVE_GENERIC
323 static inline void volk_32fc_index_min_16u_generic(uint16_t* target,
324  const lv_32fc_t* source,
325  uint32_t num_points)
326 {
327  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
328 
329  float sq_dist = 0.0;
330  float min = FLT_MAX;
331  uint16_t index = 0;
332 
333  for (uint32_t i = 0; i < num_points; ++i) {
334  sq_dist = lv_creal(source[i]) * lv_creal(source[i]) +
335  lv_cimag(source[i]) * lv_cimag(source[i]);
336 
337  if (sq_dist < min) {
338  index = i;
339  min = sq_dist;
340  }
341  }
342  target[0] = index;
343 }
344 
345 #endif /*LV_HAVE_GENERIC*/
346 
347 #endif /*INCLUDED_volk_32fc_index_min_16u_a_H*/
348 
349 #ifndef INCLUDED_volk_32fc_index_min_16u_u_H
350 #define INCLUDED_volk_32fc_index_min_16u_u_H
351 
352 #include <inttypes.h>
353 #include <limits.h>
354 #include <stdio.h>
355 #include <volk/volk_common.h>
356 #include <volk/volk_complex.h>
357 
358 #ifdef LV_HAVE_AVX2
359 #include <immintrin.h>
361 
362 static inline void volk_32fc_index_min_16u_u_avx2_variant_0(uint16_t* target,
363  const lv_32fc_t* source,
364  uint32_t num_points)
365 {
366  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
367 
368  const __m256i indices_increment = _mm256_set1_epi32(8);
369  /*
370  * At the start of each loop iteration current_indices holds the indices of
371  * the complex numbers loaded from memory. Explanation for odd order is given
372  * in implementation of vector_32fc_index_min_variant0().
373  */
374  __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
375 
376  __m256 min_values = _mm256_set1_ps(FLT_MAX);
377  __m256i min_indices = _mm256_setzero_si256();
378 
379  for (unsigned i = 0; i < num_points / 8u; ++i) {
380  __m256 in0 = _mm256_loadu_ps((float*)source);
381  __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
383  in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
384  source += 8;
385  }
386 
387  // determine minimum value and index in the result of the vectorized loop
388  __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
389  __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
390  _mm256_store_ps(min_values_buffer, min_values);
391  _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
392 
393  float min = FLT_MAX;
394  uint32_t index = 0;
395  for (unsigned i = 0; i < 8; i++) {
396  if (min_values_buffer[i] < min) {
397  min = min_values_buffer[i];
398  index = min_indices_buffer[i];
399  }
400  }
401 
402  // handle tail not processed by the vectorized loop
403  for (unsigned i = num_points & (~7u); i < num_points; ++i) {
404  const float abs_squared =
405  lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
406  if (abs_squared < min) {
407  min = abs_squared;
408  index = i;
409  }
410  ++source;
411  }
412 
413  *target = index;
414 }
415 
416 #endif /*LV_HAVE_AVX2*/
417 
418 #ifdef LV_HAVE_AVX2
419 #include <immintrin.h>
421 
422 static inline void volk_32fc_index_min_16u_u_avx2_variant_1(uint16_t* target,
423  const lv_32fc_t* source,
424  uint32_t num_points)
425 {
426  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
427 
428  const __m256i indices_increment = _mm256_set1_epi32(8);
429  /*
430  * At the start of each loop iteration current_indices holds the indices of
431  * the complex numbers loaded from memory. Explanation for odd order is given
432  * in implementation of vector_32fc_index_min_variant0().
433  */
434  __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
435 
436  __m256 min_values = _mm256_set1_ps(FLT_MAX);
437  __m256i min_indices = _mm256_setzero_si256();
438 
439  for (unsigned i = 0; i < num_points / 8u; ++i) {
440  __m256 in0 = _mm256_loadu_ps((float*)source);
441  __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
443  in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
444  source += 8;
445  }
446 
447  // determine minimum value and index in the result of the vectorized loop
448  __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
449  __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
450  _mm256_store_ps(min_values_buffer, min_values);
451  _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
452 
453  float min = FLT_MAX;
454  uint32_t index = 0;
455  for (unsigned i = 0; i < 8; i++) {
456  if (min_values_buffer[i] < min) {
457  min = min_values_buffer[i];
458  index = min_indices_buffer[i];
459  }
460  }
461 
462  // handle tail not processed by the vectorized loop
463  for (unsigned i = num_points & (~7u); i < num_points; ++i) {
464  const float abs_squared =
465  lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
466  if (abs_squared < min) {
467  min = abs_squared;
468  index = i;
469  }
470  ++source;
471  }
472 
473  *target = index;
474 }
475 
476 #endif /*LV_HAVE_AVX2*/
477 
478 #endif /*INCLUDED_volk_32fc_index_min_16u_u_H*/
Definition: volk_common.h:111
float f[4]
Definition: volk_common.h:115
__m128i int_vec
Definition: volk_common.h:123
uint32_t i[4]
Definition: volk_common.h:114
__m128 float_vec
Definition: volk_common.h:119
static void volk_32fc_index_min_16u_generic(uint16_t *target, const lv_32fc_t *source, uint32_t num_points)
Definition: volk_32fc_index_min_16u.h:323
static void volk_32fc_index_min_16u_a_sse3(uint16_t *target, const lv_32fc_t *source, uint32_t num_points)
Definition: volk_32fc_index_min_16u.h:209
static void vector_32fc_index_min_variant0(__m256 in0, __m256 in1, __m256 *min_values, __m256i *min_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:251
static void vector_32fc_index_min_variant1(__m256 in0, __m256 in1, __m256 *min_values, __m256i *min_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:313
#define bit128_p(x)
Definition: volk_common.h:142
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
#define lv_cimag(x)
Definition: volk_complex.h:89
#define lv_creal(x)
Definition: volk_complex.h:87
float complex lv_32fc_t
Definition: volk_complex.h:65
for i
Definition: volk_config_fixed.tmpl.h:25