76 #ifndef INCLUDED_volk_32fc_index_min_16u_a_H
77 #define INCLUDED_volk_32fc_index_min_16u_a_H
86 #include <immintrin.h>
89 static inline void volk_32fc_index_min_16u_a_avx2_variant_0(uint16_t* target,
93 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
95 const __m256i indices_increment = _mm256_set1_epi32(8);
101 __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
103 __m256 min_values = _mm256_set1_ps(FLT_MAX);
104 __m256i min_indices = _mm256_setzero_si256();
106 for (
unsigned i = 0;
i < num_points / 8u; ++
i) {
107 __m256 in0 = _mm256_load_ps((
float*)source);
108 __m256 in1 = _mm256_load_ps((
float*)(source + 4));
110 in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
117 _mm256_store_ps(min_values_buffer, min_values);
118 _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
122 for (
unsigned i = 0;
i < 8;
i++) {
123 if (min_values_buffer[
i] < min) {
124 min = min_values_buffer[
i];
125 index = min_indices_buffer[
i];
130 for (
unsigned i = num_points & (~7u);
i < num_points; ++
i) {
131 const float abs_squared =
133 if (abs_squared < min) {
146 #include <immintrin.h>
149 static inline void volk_32fc_index_min_16u_a_avx2_variant_1(uint16_t* target,
153 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
155 const __m256i indices_increment = _mm256_set1_epi32(8);
161 __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
163 __m256 min_values = _mm256_set1_ps(FLT_MAX);
164 __m256i min_indices = _mm256_setzero_si256();
166 for (
unsigned i = 0;
i < num_points / 8u; ++
i) {
167 __m256 in0 = _mm256_load_ps((
float*)source);
168 __m256 in1 = _mm256_load_ps((
float*)(source + 4));
170 in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
177 _mm256_store_ps(min_values_buffer, min_values);
178 _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
182 for (
unsigned i = 0;
i < 8;
i++) {
183 if (min_values_buffer[
i] < min) {
184 min = min_values_buffer[
i];
185 index = min_indices_buffer[
i];
190 for (
unsigned i = num_points & (~7u);
i < num_points; ++
i) {
191 const float abs_squared =
193 if (abs_squared < min) {
206 #include <pmmintrin.h>
207 #include <xmmintrin.h>
213 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
220 __m128 xmm1, xmm2, xmm3;
221 __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
223 xmm5.
int_vec = _mm_setzero_si128();
224 xmm4.
int_vec = _mm_setzero_si128();
225 holderf.
int_vec = _mm_setzero_si128();
226 holderi.
int_vec = _mm_setzero_si128();
228 xmm8 = _mm_setr_epi32(0, 1, 2, 3);
229 xmm9 = _mm_setzero_si128();
230 xmm10 = _mm_setr_epi32(4, 4, 4, 4);
231 xmm3 = _mm_set_ps1(FLT_MAX);
233 int bound = num_points >> 2;
235 for (
int i = 0;
i < bound; ++
i) {
236 xmm1 = _mm_load_ps((
float*)source);
237 xmm2 = _mm_load_ps((
float*)&source[2]);
241 xmm1 = _mm_mul_ps(xmm1, xmm1);
242 xmm2 = _mm_mul_ps(xmm2, xmm2);
244 xmm1 = _mm_hadd_ps(xmm1, xmm2);
246 xmm3 = _mm_min_ps(xmm1, xmm3);
248 xmm4.
float_vec = _mm_cmpgt_ps(xmm1, xmm3);
249 xmm5.
float_vec = _mm_cmpeq_ps(xmm1, xmm3);
251 xmm11 = _mm_and_si128(xmm8, xmm5.
int_vec);
252 xmm12 = _mm_and_si128(xmm9, xmm4.
int_vec);
254 xmm9 = _mm_add_epi32(xmm11, xmm12);
256 xmm8 = _mm_add_epi32(xmm8, xmm10);
259 if (num_points >> 1 & 1) {
260 xmm2 = _mm_load_ps((
float*)source);
265 xmm2 = _mm_mul_ps(xmm2, xmm2);
269 xmm1 = _mm_hadd_ps(xmm2, xmm2);
271 xmm3 = _mm_min_ps(xmm1, xmm3);
273 xmm10 = _mm_setr_epi32(2, 2, 2, 2);
275 xmm4.
float_vec = _mm_cmpgt_ps(xmm1, xmm3);
276 xmm5.
float_vec = _mm_cmpeq_ps(xmm1, xmm3);
278 xmm11 = _mm_and_si128(xmm8, xmm5.
int_vec);
279 xmm12 = _mm_and_si128(xmm9, xmm4.
int_vec);
281 xmm9 = _mm_add_epi32(xmm11, xmm12);
283 xmm8 = _mm_add_epi32(xmm8, xmm10);
286 if (num_points & 1) {
290 xmm2 = _mm_load1_ps(&sq_dist);
294 xmm3 = _mm_min_ss(xmm3, xmm2);
296 xmm4.
float_vec = _mm_cmpgt_ps(xmm1, xmm3);
297 xmm5.
float_vec = _mm_cmpeq_ps(xmm1, xmm3);
299 xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
301 xmm11 = _mm_and_si128(xmm8, xmm4.
int_vec);
302 xmm12 = _mm_and_si128(xmm9, xmm5.
int_vec);
304 xmm9 = _mm_add_epi32(xmm11, xmm12);
307 _mm_store_ps((
float*)&(holderf.
f), xmm3);
308 _mm_store_si128(&(holderi.
int_vec), xmm9);
310 target[0] = holderi.
i[0];
311 sq_dist = holderf.
f[0];
312 target[0] = (holderf.
f[1] < sq_dist) ? holderi.
i[1] : target[0];
313 sq_dist = (holderf.
f[1] < sq_dist) ? holderf.
f[1] : sq_dist;
314 target[0] = (holderf.
f[2] < sq_dist) ? holderi.
i[2] : target[0];
315 sq_dist = (holderf.
f[2] < sq_dist) ? holderf.
f[2] : sq_dist;
316 target[0] = (holderf.
f[3] < sq_dist) ? holderi.
i[3] : target[0];
317 sq_dist = (holderf.
f[3] < sq_dist) ? holderf.
f[3] : sq_dist;
322 #ifdef LV_HAVE_GENERIC
327 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
333 for (uint32_t
i = 0;
i < num_points; ++
i) {
349 #ifndef INCLUDED_volk_32fc_index_min_16u_u_H
350 #define INCLUDED_volk_32fc_index_min_16u_u_H
352 #include <inttypes.h>
359 #include <immintrin.h>
362 static inline void volk_32fc_index_min_16u_u_avx2_variant_0(uint16_t* target,
366 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
368 const __m256i indices_increment = _mm256_set1_epi32(8);
374 __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
376 __m256 min_values = _mm256_set1_ps(FLT_MAX);
377 __m256i min_indices = _mm256_setzero_si256();
379 for (
unsigned i = 0;
i < num_points / 8u; ++
i) {
380 __m256 in0 = _mm256_loadu_ps((
float*)source);
381 __m256 in1 = _mm256_loadu_ps((
float*)(source + 4));
383 in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
390 _mm256_store_ps(min_values_buffer, min_values);
391 _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
395 for (
unsigned i = 0;
i < 8;
i++) {
396 if (min_values_buffer[
i] < min) {
397 min = min_values_buffer[
i];
398 index = min_indices_buffer[
i];
403 for (
unsigned i = num_points & (~7u);
i < num_points; ++
i) {
404 const float abs_squared =
406 if (abs_squared < min) {
419 #include <immintrin.h>
422 static inline void volk_32fc_index_min_16u_u_avx2_variant_1(uint16_t* target,
426 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
428 const __m256i indices_increment = _mm256_set1_epi32(8);
434 __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
436 __m256 min_values = _mm256_set1_ps(FLT_MAX);
437 __m256i min_indices = _mm256_setzero_si256();
439 for (
unsigned i = 0;
i < num_points / 8u; ++
i) {
440 __m256 in0 = _mm256_loadu_ps((
float*)source);
441 __m256 in1 = _mm256_loadu_ps((
float*)(source + 4));
443 in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
450 _mm256_store_ps(min_values_buffer, min_values);
451 _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
455 for (
unsigned i = 0;
i < 8;
i++) {
456 if (min_values_buffer[
i] < min) {
457 min = min_values_buffer[
i];
458 index = min_indices_buffer[
i];
463 for (
unsigned i = num_points & (~7u);
i < num_points; ++
i) {
464 const float abs_squared =
466 if (abs_squared < min) {
Definition: volk_common.h:111
float f[4]
Definition: volk_common.h:115
__m128i int_vec
Definition: volk_common.h:123
uint32_t i[4]
Definition: volk_common.h:114
__m128 float_vec
Definition: volk_common.h:119
static void volk_32fc_index_min_16u_generic(uint16_t *target, const lv_32fc_t *source, uint32_t num_points)
Definition: volk_32fc_index_min_16u.h:323
static void volk_32fc_index_min_16u_a_sse3(uint16_t *target, const lv_32fc_t *source, uint32_t num_points)
Definition: volk_32fc_index_min_16u.h:209
static void vector_32fc_index_min_variant0(__m256 in0, __m256 in1, __m256 *min_values, __m256i *min_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:251
static void vector_32fc_index_min_variant1(__m256 in0, __m256 in1, __m256 *min_values, __m256i *min_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:313
#define bit128_p(x)
Definition: volk_common.h:142
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
#define lv_cimag(x)
Definition: volk_complex.h:89
#define lv_creal(x)
Definition: volk_complex.h:87
float complex lv_32fc_t
Definition: volk_complex.h:65
for i
Definition: volk_config_fixed.tmpl.h:25