70 #ifndef INCLUDED_volk_32fc_index_max_32u_a_H
71 #define INCLUDED_volk_32fc_index_max_32u_a_H
79 #include <immintrin.h>
82 static inline void volk_32fc_index_max_32u_a_avx2_variant_0(uint32_t* target,
86 const __m256i indices_increment = _mm256_set1_epi32(8);
92 __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
94 __m256 max_values = _mm256_setzero_ps();
95 __m256i max_indices = _mm256_setzero_si256();
97 for (
unsigned i = 0;
i < num_points / 8u; ++
i) {
98 __m256 in0 = _mm256_load_ps((
float*)src0);
99 __m256 in1 = _mm256_load_ps((
float*)(src0 + 4));
101 in0, in1, &max_values, &max_indices, ¤t_indices, indices_increment);
108 _mm256_store_ps(max_values_buffer, max_values);
109 _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
113 for (
unsigned i = 0;
i < 8;
i++) {
114 if (max_values_buffer[
i] > max) {
115 max = max_values_buffer[
i];
116 index = max_indices_buffer[
i];
121 for (
unsigned i = num_points & (~7u);
i < num_points; ++
i) {
122 const float abs_squared =
124 if (abs_squared > max) {
137 #include <immintrin.h>
140 static inline void volk_32fc_index_max_32u_a_avx2_variant_1(uint32_t* target,
144 const __m256i indices_increment = _mm256_set1_epi32(8);
150 __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
152 __m256 max_values = _mm256_setzero_ps();
153 __m256i max_indices = _mm256_setzero_si256();
155 for (
unsigned i = 0;
i < num_points / 8u; ++
i) {
156 __m256 in0 = _mm256_load_ps((
float*)src0);
157 __m256 in1 = _mm256_load_ps((
float*)(src0 + 4));
159 in0, in1, &max_values, &max_indices, ¤t_indices, indices_increment);
166 _mm256_store_ps(max_values_buffer, max_values);
167 _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
171 for (
unsigned i = 0;
i < 8;
i++) {
172 if (max_values_buffer[
i] > max) {
173 max = max_values_buffer[
i];
174 index = max_indices_buffer[
i];
179 for (
unsigned i = num_points & (~7u);
i < num_points; ++
i) {
180 const float abs_squared =
182 if (abs_squared > max) {
195 #include <pmmintrin.h>
196 #include <xmmintrin.h>
201 const uint32_t num_bytes = num_points * 8;
208 __m128 xmm1, xmm2, xmm3;
209 __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
211 xmm5.
int_vec = _mm_setzero_si128();
212 xmm4.
int_vec = _mm_setzero_si128();
213 holderf.
int_vec = _mm_setzero_si128();
214 holderi.
int_vec = _mm_setzero_si128();
216 int bound = num_bytes >> 5;
219 xmm8 = _mm_setr_epi32(0, 1, 2, 3);
220 xmm9 = _mm_setzero_si128();
221 xmm10 = _mm_setr_epi32(4, 4, 4, 4);
222 xmm3 = _mm_setzero_ps();
224 for (;
i < bound; ++
i) {
225 xmm1 = _mm_load_ps((
float*)src0);
226 xmm2 = _mm_load_ps((
float*)&src0[2]);
230 xmm1 = _mm_mul_ps(xmm1, xmm1);
231 xmm2 = _mm_mul_ps(xmm2, xmm2);
233 xmm1 = _mm_hadd_ps(xmm1, xmm2);
235 xmm3 = _mm_max_ps(xmm1, xmm3);
237 xmm4.
float_vec = _mm_cmplt_ps(xmm1, xmm3);
238 xmm5.
float_vec = _mm_cmpeq_ps(xmm1, xmm3);
240 xmm11 = _mm_and_si128(xmm8, xmm5.
int_vec);
241 xmm12 = _mm_and_si128(xmm9, xmm4.
int_vec);
243 xmm9 = _mm_add_epi32(xmm11, xmm12);
245 xmm8 = _mm_add_epi32(xmm8, xmm10);
248 if (num_bytes >> 4 & 1) {
249 xmm2 = _mm_load_ps((
float*)src0);
254 xmm2 = _mm_mul_ps(xmm2, xmm2);
258 xmm1 = _mm_hadd_ps(xmm2, xmm2);
260 xmm3 = _mm_max_ps(xmm1, xmm3);
262 xmm10 = _mm_setr_epi32(2, 2, 2, 2);
264 xmm4.
float_vec = _mm_cmplt_ps(xmm1, xmm3);
265 xmm5.
float_vec = _mm_cmpeq_ps(xmm1, xmm3);
267 xmm11 = _mm_and_si128(xmm8, xmm5.
int_vec);
268 xmm12 = _mm_and_si128(xmm9, xmm4.
int_vec);
270 xmm9 = _mm_add_epi32(xmm11, xmm12);
272 xmm8 = _mm_add_epi32(xmm8, xmm10);
275 if (num_bytes >> 3 & 1) {
279 xmm2 = _mm_load1_ps(&sq_dist);
283 xmm3 = _mm_max_ss(xmm3, xmm2);
285 xmm4.
float_vec = _mm_cmplt_ps(xmm1, xmm3);
286 xmm5.
float_vec = _mm_cmpeq_ps(xmm1, xmm3);
288 xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
290 xmm11 = _mm_and_si128(xmm8, xmm4.
int_vec);
291 xmm12 = _mm_and_si128(xmm9, xmm5.
int_vec);
293 xmm9 = _mm_add_epi32(xmm11, xmm12);
296 _mm_store_ps((
float*)&(holderf.
f), xmm3);
297 _mm_store_si128(&(holderi.
int_vec), xmm9);
299 target[0] = holderi.
i[0];
300 sq_dist = holderf.
f[0];
301 target[0] = (holderf.
f[1] > sq_dist) ? holderi.
i[1] : target[0];
302 sq_dist = (holderf.
f[1] > sq_dist) ? holderf.
f[1] : sq_dist;
303 target[0] = (holderf.
f[2] > sq_dist) ? holderi.
i[2] : target[0];
304 sq_dist = (holderf.
f[2] > sq_dist) ? holderf.
f[2] : sq_dist;
305 target[0] = (holderf.
f[3] > sq_dist) ? holderi.
i[3] : target[0];
306 sq_dist = (holderf.
f[3] > sq_dist) ? holderf.
f[3] : sq_dist;
311 #ifdef LV_HAVE_GENERIC
315 const uint32_t num_bytes = num_points * 8;
323 for (; i<num_bytes>> 3; ++
i) {
339 #ifndef INCLUDED_volk_32fc_index_max_32u_u_H
340 #define INCLUDED_volk_32fc_index_max_32u_u_H
342 #include <inttypes.h>
348 #include <immintrin.h>
351 static inline void volk_32fc_index_max_32u_u_avx2_variant_0(uint32_t* target,
355 const __m256i indices_increment = _mm256_set1_epi32(8);
361 __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
363 __m256 max_values = _mm256_setzero_ps();
364 __m256i max_indices = _mm256_setzero_si256();
366 for (
unsigned i = 0;
i < num_points / 8u; ++
i) {
367 __m256 in0 = _mm256_loadu_ps((
float*)src0);
368 __m256 in1 = _mm256_loadu_ps((
float*)(src0 + 4));
370 in0, in1, &max_values, &max_indices, ¤t_indices, indices_increment);
377 _mm256_store_ps(max_values_buffer, max_values);
378 _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
382 for (
unsigned i = 0;
i < 8;
i++) {
383 if (max_values_buffer[
i] > max) {
384 max = max_values_buffer[
i];
385 index = max_indices_buffer[
i];
390 for (
unsigned i = num_points & (~7u);
i < num_points; ++
i) {
391 const float abs_squared =
393 if (abs_squared > max) {
406 #include <immintrin.h>
409 static inline void volk_32fc_index_max_32u_u_avx2_variant_1(uint32_t* target,
413 const __m256i indices_increment = _mm256_set1_epi32(8);
419 __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
421 __m256 max_values = _mm256_setzero_ps();
422 __m256i max_indices = _mm256_setzero_si256();
424 for (
unsigned i = 0;
i < num_points / 8u; ++
i) {
425 __m256 in0 = _mm256_loadu_ps((
float*)src0);
426 __m256 in1 = _mm256_loadu_ps((
float*)(src0 + 4));
428 in0, in1, &max_values, &max_indices, ¤t_indices, indices_increment);
435 _mm256_store_ps(max_values_buffer, max_values);
436 _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
440 for (
unsigned i = 0;
i < 8;
i++) {
441 if (max_values_buffer[
i] > max) {
442 max = max_values_buffer[
i];
443 index = max_indices_buffer[
i];
448 for (
unsigned i = num_points & (~7u);
i < num_points; ++
i) {
449 const float abs_squared =
451 if (abs_squared > max) {
464 #include <arm_neon.h>
/*!
 * \brief Finds the index of the complex sample with the largest |z|^2 (NEON).
 *
 * Fixes two defects visible in the fragment:
 *  - The running maximum was seeded with `*src0Ptr` (complex-to-float
 *    conversion, i.e. the REAL PART of sample 0) rather than its |z|^2, which
 *    could mask a genuine maximum whose |z|^2 is below re(z0). It is now
 *    seeded with |z0|^2.
 *  - src0 was dereferenced before any size check; num_points == 0 now returns
 *    early without touching the buffer (no result is defined for empty input).
 *
 * \param target     Output: index of the max-magnitude sample is written here.
 * \param src0       Input buffer of complex floats.
 * \param num_points Number of complex samples in \p src0.
 */
static inline void
volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
{
    unsigned int number = 0;
    const uint32_t quarter_points = num_points / 4;
    const lv_32fc_t* src0Ptr = src0;

    // Lane indices of the current group of 4 samples, advanced by 4 per pass.
    uint32_t indices[4] = { 0, 1, 2, 3 };
    const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
    uint32x4_t vec_indices = vld1q_u32(indices);
    uint32x4_t vec_max_indices = vec_indices;

    if (num_points == 0) {
        return; // nothing to scan; leave *target untouched
    }

    // Seed both the scalar and the vector maxima with |z0|^2 so that sample 0
    // (index 0) is the correct answer until something larger appears.
    float max = lv_creal(*src0Ptr) * lv_creal(*src0Ptr) +
                lv_cimag(*src0Ptr) * lv_cimag(*src0Ptr);
    uint32_t index = 0;

    float32x4_t vec_max = vdupq_n_f32(max);

    for (; number < quarter_points; number++) {
        // Load 4 complex samples (deinterleaved) and compute |z|^2 per lane.
        const float32x4_t vec_mag2 =
            _vmagnitudesquaredq_f32(vld2q_f32((float*)src0Ptr));
        __VOLK_PREFETCH(src0Ptr += 4);
        // Where the new magnitude beats the running max, take value and index.
        const uint32x4_t gt_mask = vcgtq_f32(vec_mag2, vec_max);
        vec_max = vbslq_f32(gt_mask, vec_mag2, vec_max);
        vec_max_indices = vbslq_u32(gt_mask, vec_indices, vec_max_indices);
        vec_indices = vaddq_u32(vec_indices, vec_indices_incr);
    }

    // Reduce the 4 lanes to a single scalar max/index pair.
    uint32_t tmp_max_indices[4];
    float tmp_max[4];
    vst1q_u32(tmp_max_indices, vec_max_indices);
    vst1q_f32(tmp_max, vec_max);

    for (int i = 0; i < 4; i++) {
        if (tmp_max[i] > max) {
            max = tmp_max[i];
            index = tmp_max_indices[i];
        }
    }

    // Scalar tail: the 0..3 samples not covered by the vectorized loop.
    for (number = quarter_points * 4; number < num_points; number++) {
        const float re = lv_creal(*src0Ptr);
        const float im = lv_cimag(*src0Ptr);
        if ((re * re + im * im) > max) {
            max = re * re + im * im;
            index = number;
        }
        src0Ptr++;
    }

    *target = index;
}
Definition: volk_common.h:111
float f[4]
Definition: volk_common.h:115
__m128i int_vec
Definition: volk_common.h:123
uint32_t i[4]
Definition: volk_common.h:114
__m128 float_vec
Definition: volk_common.h:119
static void volk_32fc_index_max_32u_generic(uint32_t *target, lv_32fc_t *src0, uint32_t num_points)
Definition: volk_32fc_index_max_32u.h:313
static void volk_32fc_index_max_32u_a_sse3(uint32_t *target, lv_32fc_t *src0, uint32_t num_points)
Definition: volk_32fc_index_max_32u.h:199
static void volk_32fc_index_max_32u_neon(uint32_t *target, lv_32fc_t *src0, uint32_t num_points)
Definition: volk_32fc_index_max_32u.h:468
static void vector_32fc_index_max_variant1(__m256 in0, __m256 in1, __m256 *max_values, __m256i *max_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:201
static void vector_32fc_index_max_variant0(__m256 in0, __m256 in1, __m256 *max_values, __m256i *max_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:139
#define bit128_p(x)
Definition: volk_common.h:142
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
#define lv_cimag(x)
Definition: volk_complex.h:89
#define lv_creal(x)
Definition: volk_complex.h:87
float complex lv_32fc_t
Definition: volk_complex.h:65
for i
Definition: volk_config_fixed.tmpl.h:25
static float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
Definition: volk_neon_intrinsics.h:87