55 #ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
56 #define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
63 #include <emmintrin.h>
70 unsigned int num_points)
72 const unsigned int num_bytes = num_points * 2;
76 int bound = (num_bytes >> 4);
77 int bound_copy = bound;
78 int leftovers = (num_bytes >> 1) & 7;
80 __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3;
81 p_target = (__m128i*)target;
82 p_src0 = (__m128i*)src0;
83 p_src1 = (__m128i*)src1;
84 p_src2 = (__m128i*)src2;
85 p_src3 = (__m128i*)src3;
87 __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
89 while (bound_copy > 0) {
90 xmm1 = _mm_load_si128(p_src0);
91 xmm2 = _mm_load_si128(p_src1);
92 xmm3 = _mm_load_si128(p_src2);
93 xmm4 = _mm_load_si128(p_src3);
95 xmm5 = _mm_setzero_si128();
96 xmm6 = _mm_setzero_si128();
100 xmm1 = _mm_sub_epi16(xmm2, xmm1);
102 xmm3 = _mm_sub_epi16(xmm4, xmm3);
104 xmm5 = _mm_cmpgt_epi16(xmm1, xmm5);
105 xmm6 = _mm_cmpgt_epi16(xmm3, xmm6);
107 xmm2 = _mm_and_si128(xmm5, xmm2);
108 xmm4 = _mm_and_si128(xmm6, xmm4);
109 xmm5 = _mm_andnot_si128(xmm5, xmm7);
110 xmm6 = _mm_andnot_si128(xmm6, xmm8);
112 xmm5 = _mm_add_epi16(xmm2, xmm5);
113 xmm6 = _mm_add_epi16(xmm4, xmm6);
115 xmm1 = _mm_xor_si128(xmm1, xmm1);
117 xmm5 = _mm_sub_epi16(xmm6, xmm5);
121 xmm1 = _mm_cmpgt_epi16(xmm5, xmm1);
124 xmm6 = _mm_and_si128(xmm1, xmm6);
126 xmm1 = _mm_andnot_si128(xmm1, xmm2);
129 xmm1 = _mm_add_epi16(xmm6, xmm1);
132 _mm_store_si128(p_target, xmm1);
198 for (
i = bound * 8;
i < (bound * 8) + leftovers; ++
i) {
199 temp0 = ((short)(src0[
i] - src1[
i]) > 0) ? src0[
i] : src1[
i];
200 temp1 = ((short)(src2[
i] - src3[
i]) > 0) ? src2[
i] : src3[
i];
201 target[
i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
210 #include <arm_neon.h>
217 unsigned int num_points)
219 const unsigned int eighth_points = num_points / 8;
222 int16x8_t src0_vec, src1_vec, src2_vec, src3_vec;
223 int16x8_t diff12, diff34;
224 int16x8_t comp0, comp1, comp2, comp3;
225 int16x8_t result1_vec, result2_vec;
227 zeros = vdupq_n_s16(0);
228 for (
i = 0;
i < eighth_points; ++
i) {
229 src0_vec = vld1q_s16(src0);
230 src1_vec = vld1q_s16(src1);
231 src2_vec = vld1q_s16(src2);
232 src3_vec = vld1q_s16(src3);
233 diff12 = vsubq_s16(src0_vec, src1_vec);
234 diff34 = vsubq_s16(src2_vec, src3_vec);
235 comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
236 comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
237 comp2 = (int16x8_t)vcgeq_s16(diff34, zeros);
238 comp3 = (int16x8_t)vcltq_s16(diff34, zeros);
239 comp0 = vandq_s16(src0_vec, comp0);
240 comp1 = vandq_s16(src1_vec, comp1);
241 comp2 = vandq_s16(src2_vec, comp2);
242 comp3 = vandq_s16(src3_vec, comp3);
244 result1_vec = vaddq_s16(comp0, comp1);
245 result2_vec = vaddq_s16(comp2, comp3);
247 diff12 = vsubq_s16(result1_vec, result2_vec);
248 comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
249 comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
250 comp0 = vandq_s16(result1_vec, comp0);
251 comp1 = vandq_s16(result2_vec, comp1);
252 result1_vec = vaddq_s16(comp0, comp1);
253 vst1q_s16(target, result1_vec);
263 for (
i = eighth_points * 8;
i < num_points; ++
i) {
264 temp0 = ((short)(*src0 - *src1) > 0) ? *src0 : *src1;
265 temp1 = ((short)(*src2 - *src3) > 0) ? *src2 : *src3;
266 *target++ = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
276 #ifdef LV_HAVE_GENERIC
282 unsigned int num_points)
284 const unsigned int num_bytes = num_points * 2;
288 int bound = num_bytes >> 1;
292 for (
i = 0;
i < bound; ++
i) {
293 temp0 = ((short)(src0[
i] - src1[
i]) > 0) ? src0[
i] : src1[
i];
294 temp1 = ((short)(src2[
i] - src3[
i]) > 0) ? src2[
i] : src3[
i];
295 target[
i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
static void volk_16i_x4_quad_max_star_16i_generic(short *target, short *src0, short *src1, short *src2, short *src3, unsigned int num_points)
Definition: volk_16i_x4_quad_max_star_16i.h:277
static void volk_16i_x4_quad_max_star_16i_neon(short *target, short *src0, short *src1, short *src2, short *src3, unsigned int num_points)
Definition: volk_16i_x4_quad_max_star_16i.h:212
static void volk_16i_x4_quad_max_star_16i_a_sse2(short *target, short *src0, short *src1, short *src2, short *src3, unsigned int num_points)
Definition: volk_16i_x4_quad_max_star_16i.h:65
for i
Definition: volk_config_fixed.tmpl.h:25