Vector Optimized Library of Kernels  2.5.1
Architecture-tuned implementations of math kernels
volk_32fc_convert_16ic.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2016 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
46 #ifndef INCLUDED_volk_32fc_convert_16ic_a_H
47 #define INCLUDED_volk_32fc_convert_16ic_a_H
48 
49 #include "volk/volk_complex.h"
50 #include <limits.h>
51 #include <math.h>
52 
53 #ifdef LV_HAVE_AVX2
54 #include <immintrin.h>
55 
56 static inline void volk_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector,
57  const lv_32fc_t* inputVector,
58  unsigned int num_points)
59 {
60  const unsigned int avx_iters = num_points / 8;
61 
62  float* inputVectorPtr = (float*)inputVector;
63  int16_t* outputVectorPtr = (int16_t*)outputVector;
64  float aux;
65 
66  const float min_val = (float)SHRT_MIN;
67  const float max_val = (float)SHRT_MAX;
68 
69  __m256 inputVal1, inputVal2;
70  __m256i intInputVal1, intInputVal2;
71  __m256 ret1, ret2;
72  const __m256 vmin_val = _mm256_set1_ps(min_val);
73  const __m256 vmax_val = _mm256_set1_ps(max_val);
74  unsigned int i;
75 
76  for (i = 0; i < avx_iters; i++) {
77  inputVal1 = _mm256_load_ps((float*)inputVectorPtr);
78  inputVectorPtr += 8;
79  inputVal2 = _mm256_load_ps((float*)inputVectorPtr);
80  inputVectorPtr += 8;
81  __VOLK_PREFETCH(inputVectorPtr + 16);
82 
83  // Clip
84  ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
85  ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
86 
87  intInputVal1 = _mm256_cvtps_epi32(ret1);
88  intInputVal2 = _mm256_cvtps_epi32(ret2);
89 
90  intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
91  intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
92 
93  _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
94  outputVectorPtr += 16;
95  }
96 
97  for (i = avx_iters * 16; i < num_points * 2; i++) {
98  aux = *inputVectorPtr++;
99  if (aux > max_val)
100  aux = max_val;
101  else if (aux < min_val)
102  aux = min_val;
103  *outputVectorPtr++ = (int16_t)rintf(aux);
104  }
105 }
106 #endif /* LV_HAVE_AVX2 */
107 
108 #ifdef LV_HAVE_SSE2
109 #include <emmintrin.h>
110 
111 static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector,
112  const lv_32fc_t* inputVector,
113  unsigned int num_points)
114 {
115  const unsigned int sse_iters = num_points / 4;
116 
117  float* inputVectorPtr = (float*)inputVector;
118  int16_t* outputVectorPtr = (int16_t*)outputVector;
119  float aux;
120 
121  const float min_val = (float)SHRT_MIN;
122  const float max_val = (float)SHRT_MAX;
123 
124  __m128 inputVal1, inputVal2;
125  __m128i intInputVal1, intInputVal2;
126  __m128 ret1, ret2;
127  const __m128 vmin_val = _mm_set_ps1(min_val);
128  const __m128 vmax_val = _mm_set_ps1(max_val);
129  unsigned int i;
130 
131  for (i = 0; i < sse_iters; i++) {
132  inputVal1 = _mm_load_ps((float*)inputVectorPtr);
133  inputVectorPtr += 4;
134  inputVal2 = _mm_load_ps((float*)inputVectorPtr);
135  inputVectorPtr += 4;
136  __VOLK_PREFETCH(inputVectorPtr + 8);
137 
138  // Clip
139  ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
140  ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
141 
142  intInputVal1 = _mm_cvtps_epi32(ret1);
143  intInputVal2 = _mm_cvtps_epi32(ret2);
144 
145  intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
146 
147  _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
148  outputVectorPtr += 8;
149  }
150 
151  for (i = sse_iters * 8; i < num_points * 2; i++) {
152  aux = *inputVectorPtr++;
153  if (aux > max_val)
154  aux = max_val;
155  else if (aux < min_val)
156  aux = min_val;
157  *outputVectorPtr++ = (int16_t)rintf(aux);
158  }
159 }
160 #endif /* LV_HAVE_SSE2 */
161 
162 
163 #if LV_HAVE_NEONV7
164 #include <arm_neon.h>
165 
/*
 * Converts the four elements val[0..3] into res[0..3], issuing one
 * VCVTR.S32.F32 instruction per element through __VOLK_ASM.
 *
 * res must be an lvalue whose elements 0..3 can be written; val is only read.
 *
 * The expansion is wrapped in do { } while (0) so that it forms exactly one
 * statement: `VCVTRQ_S32_F32(a, b);` is then safe inside an unbraced if/else.
 * Both arguments are parenthesized before being indexed.
 */
#define VCVTRQ_S32_F32(res, val)                    \
    do {                                            \
        __VOLK_ASM("VCVTR.S32.F32 %[r0], %[v0]\n\t" \
                   : [r0] "=w"((res)[0])            \
                   : [v0] "w"((val)[0])             \
                   :);                              \
        __VOLK_ASM("VCVTR.S32.F32 %[r1], %[v1]\n\t" \
                   : [r1] "=w"((res)[1])            \
                   : [v1] "w"((val)[1])             \
                   :);                              \
        __VOLK_ASM("VCVTR.S32.F32 %[r2], %[v2]\n\t" \
                   : [r2] "=w"((res)[2])            \
                   : [v2] "w"((val)[2])             \
                   :);                              \
        __VOLK_ASM("VCVTR.S32.F32 %[r3], %[v3]\n\t" \
                   : [r3] "=w"((res)[3])            \
                   : [v3] "w"((val)[3])             \
                   :);                              \
    } while (0)
180 
/*!
 * \brief Converts complex float samples to complex 16-bit integers with
 *        saturation, using NEON intrinsics plus the VCVTRQ_S32_F32 macro.
 *
 * Both buffers are walked as interleaved scalars: 2 * num_points floats are
 * read and the same number of int16_t values are written. Every value is
 * clipped to [SHRT_MIN, SHRT_MAX] before it is converted to an integer.
 *
 * \param outputVector Destination buffer, written with vst1q_s16.
 * \param inputVector  Source buffer, read with vld1q_f32.
 * \param num_points   Number of complex samples to convert.
 */
static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector,
                                               const lv_32fc_t* inputVector,
                                               unsigned int num_points)
{
    const float lower = (float)SHRT_MIN;
    const float upper = (float)SHRT_MAX;
    const float32x4_t lower_v = vmovq_n_f32(lower);
    const float32x4_t upper_v = vmovq_n_f32(upper);

    /* One vector pass consumes 8 floats, i.e. 4 complex samples. */
    const unsigned int blocks = num_points / 4;

    const float32_t* src = (const float32_t*)inputVector;
    int16_t* dst = (int16_t*)outputVector;

    /* Start from zero: the conversion macro fills these one element at a time. */
    int32x4_t first_i32 = { 0, 0, 0, 0 };
    int32x4_t second_i32 = { 0, 0, 0, 0 };
    unsigned int n;

    for (n = 0; n < blocks; n++) {
        const float32x4_t first = vld1q_f32(src);
        const float32x4_t second = vld1q_f32(src + 4);
        float32x4_t first_clipped, second_clipped;

        src += 8;
        __VOLK_PREFETCH(src + 8);

        /* Saturate in the float domain: upper bound first, then lower bound. */
        first_clipped = vmaxq_f32(vminq_f32(first, upper_v), lower_v);
        second_clipped = vmaxq_f32(vminq_f32(second, upper_v), lower_v);

        // vcvtr takes into account the current rounding mode (as does rintf)
        VCVTRQ_S32_F32(first_i32, first_clipped);
        VCVTRQ_S32_F32(second_i32, second_clipped);

        /* Narrow both halves to 16 bits and store them as one 8-element vector. */
        vst1q_s16(dst, vcombine_s16(vqmovn_s32(first_i32), vqmovn_s32(second_i32)));
        dst += 8;
    }

    /* Scalar pass over the samples the vector loop did not cover. */
    for (n = blocks * 8; n < num_points * 2; n++) {
        float32_t v = *src++;
        v = (v > upper) ? upper : ((v < lower) ? lower : v);
        *dst++ = (int16_t)rintf(v);
    }
}
236 
237 #undef VCVTRQ_S32_F32
238 #endif /* LV_HAVE_NEONV7 */
239 
240 #if LV_HAVE_NEONV8
241 #include <arm_neon.h>
242 
/*!
 * \brief Converts complex float samples to complex 16-bit integers with
 *        saturation, using NEON intrinsics only (no inline assembly).
 *
 * Both buffers are walked as interleaved scalars: 2 * num_points floats are
 * read and the same number of int16_t values are written. Every value is
 * clipped to [SHRT_MIN, SHRT_MAX] before it is converted to an integer.
 *
 * \param outputVector Destination buffer, written with vst1q_s16.
 * \param inputVector  Source buffer, read with vld1q_f32.
 * \param num_points   Number of complex samples to convert.
 */
static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector,
                                                 const lv_32fc_t* inputVector,
                                                 unsigned int num_points)
{
    const float lower = (float)SHRT_MIN;
    const float upper = (float)SHRT_MAX;
    const float32x4_t lower_v = vmovq_n_f32(lower);
    const float32x4_t upper_v = vmovq_n_f32(upper);

    /* One vector pass consumes 8 floats, i.e. 4 complex samples. */
    const unsigned int blocks = num_points / 4;

    const float32_t* src = (const float32_t*)inputVector;
    int16_t* dst = (int16_t*)outputVector;
    unsigned int n;

    for (n = 0; n < blocks; n++) {
        const float32x4_t first = vld1q_f32(src);
        const float32x4_t second = vld1q_f32(src + 4);
        float32x4_t first_clipped, second_clipped;
        int32x4_t first_i32, second_i32;

        src += 8;
        __VOLK_PREFETCH(src + 8);

        /* Saturate in the float domain: upper bound first, then lower bound. */
        first_clipped = vmaxq_f32(vminq_f32(first, upper_v), lower_v);
        second_clipped = vmaxq_f32(vminq_f32(second, upper_v), lower_v);

        // vrndiq takes into account the current rounding mode (as does rintf)
        first_i32 = vcvtq_s32_f32(vrndiq_f32(first_clipped));
        second_i32 = vcvtq_s32_f32(vrndiq_f32(second_clipped));

        /* Narrow both halves to 16 bits and store them as one 8-element vector. */
        vst1q_s16(dst, vcombine_s16(vqmovn_s32(first_i32), vqmovn_s32(second_i32)));
        dst += 8;
    }

    /* Scalar pass over the samples the vector loop did not cover. */
    for (n = blocks * 8; n < num_points * 2; n++) {
        float32_t v = *src++;
        v = (v > upper) ? upper : ((v < lower) ? lower : v);
        *dst++ = (int16_t)rintf(v);
    }
}
296 #endif /* LV_HAVE_NEONV8 */
297 
298 
299 #ifdef LV_HAVE_GENERIC
300 
301 static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector,
302  const lv_32fc_t* inputVector,
303  unsigned int num_points)
304 {
305  float* inputVectorPtr = (float*)inputVector;
306  int16_t* outputVectorPtr = (int16_t*)outputVector;
307  const float min_val = (float)SHRT_MIN;
308  const float max_val = (float)SHRT_MAX;
309  float aux;
310  unsigned int i;
311  for (i = 0; i < num_points * 2; i++) {
312  aux = *inputVectorPtr++;
313  if (aux > max_val)
314  aux = max_val;
315  else if (aux < min_val)
316  aux = min_val;
317  *outputVectorPtr++ = (int16_t)rintf(aux);
318  }
319 }
320 #endif /* LV_HAVE_GENERIC */
321 
322 #endif /* INCLUDED_volk_32fc_convert_16ic_a_H */
323 
324 #ifndef INCLUDED_volk_32fc_convert_16ic_u_H
325 #define INCLUDED_volk_32fc_convert_16ic_u_H
326 
327 #include "volk/volk_complex.h"
328 #include <limits.h>
329 #include <math.h>
330 
331 
332 #ifdef LV_HAVE_AVX2
333 #include <immintrin.h>
334 
335 static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector,
336  const lv_32fc_t* inputVector,
337  unsigned int num_points)
338 {
339  const unsigned int avx_iters = num_points / 8;
340 
341  float* inputVectorPtr = (float*)inputVector;
342  int16_t* outputVectorPtr = (int16_t*)outputVector;
343  float aux;
344 
345  const float min_val = (float)SHRT_MIN;
346  const float max_val = (float)SHRT_MAX;
347 
348  __m256 inputVal1, inputVal2;
349  __m256i intInputVal1, intInputVal2;
350  __m256 ret1, ret2;
351  const __m256 vmin_val = _mm256_set1_ps(min_val);
352  const __m256 vmax_val = _mm256_set1_ps(max_val);
353  unsigned int i;
354 
355  for (i = 0; i < avx_iters; i++) {
356  inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr);
357  inputVectorPtr += 8;
358  inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr);
359  inputVectorPtr += 8;
360  __VOLK_PREFETCH(inputVectorPtr + 16);
361 
362  // Clip
363  ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
364  ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
365 
366  intInputVal1 = _mm256_cvtps_epi32(ret1);
367  intInputVal2 = _mm256_cvtps_epi32(ret2);
368 
369  intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
370  intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
371 
372  _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
373  outputVectorPtr += 16;
374  }
375 
376  for (i = avx_iters * 16; i < num_points * 2; i++) {
377  aux = *inputVectorPtr++;
378  if (aux > max_val)
379  aux = max_val;
380  else if (aux < min_val)
381  aux = min_val;
382  *outputVectorPtr++ = (int16_t)rintf(aux);
383  }
384 }
385 #endif /* LV_HAVE_AVX2 */
386 
387 
388 #ifdef LV_HAVE_SSE2
389 #include <emmintrin.h>
390 
391 static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector,
392  const lv_32fc_t* inputVector,
393  unsigned int num_points)
394 {
395  const unsigned int sse_iters = num_points / 4;
396 
397  float* inputVectorPtr = (float*)inputVector;
398  int16_t* outputVectorPtr = (int16_t*)outputVector;
399  float aux;
400 
401  const float min_val = (float)SHRT_MIN;
402  const float max_val = (float)SHRT_MAX;
403 
404  __m128 inputVal1, inputVal2;
405  __m128i intInputVal1, intInputVal2;
406  __m128 ret1, ret2;
407  const __m128 vmin_val = _mm_set_ps1(min_val);
408  const __m128 vmax_val = _mm_set_ps1(max_val);
409 
410  unsigned int i;
411  for (i = 0; i < sse_iters; i++) {
412  inputVal1 = _mm_loadu_ps((float*)inputVectorPtr);
413  inputVectorPtr += 4;
414  inputVal2 = _mm_loadu_ps((float*)inputVectorPtr);
415  inputVectorPtr += 4;
416  __VOLK_PREFETCH(inputVectorPtr + 8);
417 
418  // Clip
419  ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
420  ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
421 
422  intInputVal1 = _mm_cvtps_epi32(ret1);
423  intInputVal2 = _mm_cvtps_epi32(ret2);
424 
425  intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
426 
427  _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
428  outputVectorPtr += 8;
429  }
430 
431  for (i = sse_iters * 8; i < num_points * 2; i++) {
432  aux = *inputVectorPtr++;
433  if (aux > max_val)
434  aux = max_val;
435  else if (aux < min_val)
436  aux = min_val;
437  *outputVectorPtr++ = (int16_t)rintf(aux);
438  }
439 }
440 #endif /* LV_HAVE_SSE2 */
441 #endif /* INCLUDED_volk_32fc_convert_16ic_u_H */
static float rintf(float x)
Definition: config.h:37
static void volk_32fc_convert_16ic_a_sse2(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition: volk_32fc_convert_16ic.h:111
static void volk_32fc_convert_16ic_u_sse2(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition: volk_32fc_convert_16ic.h:391
static void volk_32fc_convert_16ic_generic(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition: volk_32fc_convert_16ic.h:301
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
float complex lv_32fc_t
Definition: volk_complex.h:65
short complex lv_16sc_t
Definition: volk_complex.h:62
for i
Definition: volk_config_fixed.tmpl.h:25