volk_8u_x4_conv_k7_r2_8u.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#ifndef INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
#define INCLUDED_volk_8u_x4_conv_k7_r2_8u_H

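/*
 * Viterbi add-compare-select (ACS) kernel for the K = 7, rate 1/2
 * convolutional code (64 trellis states).  X and Y hold the 64 8-bit
 * path metrics for the current and next stage, syms holds one byte per
 * received code symbol, Branchtab is the RATE x NUMSTATES/2 (2 x 32)
 * branch-metric table, and the per-state survivor decisions are packed
 * into dec, 64 bits (one decision_t) per trellis stage.
 *
 * Minimal usage sketch (illustrative only; buffer sizing, metric
 * initialization, branch-table construction, and the traceback step are
 * assumptions of this sketch, not defined in this header):
 *
 *   unsigned char X[64], Y[64];   // path metric buffers
 *   // initialize X to the starting metrics, fill Branchtab and syms ...
 *   volk_8u_x4_conv_k7_r2_8u(Y, X, syms, dec, framebits, excess, Branchtab);
 *   // ... then trace back through the packed decisions in dec.
 */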
typedef union {
    unsigned char /*DECISIONTYPE*/ t[64 /*NUMSTATES*/ / 8 /*DECISIONTYPE_BITSIZE*/];
    unsigned int w[64 /*NUMSTATES*/ / 32];
    unsigned short s[64 /*NUMSTATES*/ / 16];
    unsigned char c[64 /*NUMSTATES*/ / 8];
#ifdef _MSC_VER
} decision_t;
#else
} decision_t __attribute__((aligned(16)));
#endif


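/* Subtract the smallest of the 64 path metrics from all of them so the
 * unsigned 8-bit metrics never saturate.  The threshold parameter is
 * currently unused because the early-out check below is commented out. */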
static inline void renormalize(unsigned char* X, unsigned char threshold)
{
    int NUMSTATES = 64;
    int i;

    unsigned char min = X[0];
    // if(min > threshold) {
    for (i = 0; i < NUMSTATES; i++)
        if (min > X[i])
            min = X[i];
    for (i = 0; i < NUMSTATES; i++)
        X[i] -= min;
    //}
}


// helper BFLY for GENERIC version
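/* One Viterbi butterfly: for states i and i + 32, compute the branch
 * metric from the RATE (= 2) received symbols of stage s, add-compare-select
 * against the old metrics in X, write the two surviving metrics to
 * Y[2*i] and Y[2*i + 1], and pack the two decision bits into the 64-bit
 * decision record for stage s at bit position 2*i. */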
static inline void BFLY(int i,
                        int s,
                        unsigned char* syms,
                        unsigned char* Y,
                        unsigned char* X,
                        decision_t* d,
                        unsigned char* Branchtab)
{
    int j, decision0, decision1;
    unsigned char metric, m0, m1, m2, m3;

    int NUMSTATES = 64;
    int RATE = 2;
    int METRICSHIFT = 2;
    int PRECISIONSHIFT = 2;

    metric = 0;
    for (j = 0; j < RATE; j++)
        metric += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]) >> METRICSHIFT;
    metric = metric >> PRECISIONSHIFT;

    unsigned char max = ((RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT);

    m0 = X[i] + metric;
    m1 = X[i + NUMSTATES / 2] + (max - metric);
    m2 = X[i] + (max - metric);
    m3 = X[i + NUMSTATES / 2] + metric;

    decision0 = (signed int)(m0 - m1) > 0;
    decision1 = (signed int)(m2 - m3) > 0;

    Y[2 * i] = decision0 ? m1 : m0;
    Y[2 * i + 1] = decision1 ? m3 : m2;

    d->w[i / (sizeof(unsigned int) * 8 / 2) +
         s * (sizeof(decision_t) / sizeof(unsigned int))] |=
        (decision0 | decision1 << 1) << ((2 * i) & (sizeof(unsigned int) * 8 - 1));
}


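/* An AVX2 variant of this kernel exists below but is commented out, so it
 * is not currently compiled or dispatched. */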
//#if LV_HAVE_AVX2
//
//#include <immintrin.h>
//#include <stdio.h>
//
// static inline void volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y,
//                                                  unsigned char* X,
//                                                  unsigned char* syms,
//                                                  unsigned char* dec,
//                                                  unsigned int framebits,
//                                                  unsigned int excess,
//                                                  unsigned char* Branchtab)
//{
//    unsigned int i9;
//    for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
//        unsigned char a75, a81;
//        int a73, a92;
//        int s20, s21;
//        unsigned char *a80, *b6;
//        int *a110, *a91, *a93;
//        __m256i *a112, *a71, *a72, *a77, *a83, *a95;
//        __m256i a86, a87;
//        __m256i a76, a78, a79, a82, a84, a85, a88, a89, a90, d10, d9, m23, m24, m25,
//            m26, s18, s19, s22, s23, s24, s25, t13, t14, t15;
//        a71 = ((__m256i*)X);
//        s18 = *(a71);
//        a72 = (a71 + 1);
//        s19 = *(a72);
//        s22 = _mm256_permute2x128_si256(s18, s19, 0x20);
//        s19 = _mm256_permute2x128_si256(s18, s19, 0x31);
//        s18 = s22;
//        a73 = (4 * i9);
//        b6 = (syms + a73);
//        a75 = *(b6);
//        a76 = _mm256_set1_epi8(a75);
//        a77 = ((__m256i*)Branchtab);
//        a78 = *(a77);
//        a79 = _mm256_xor_si256(a76, a78);
//        a80 = (b6 + 1);
//        a81 = *(a80);
//        a82 = _mm256_set1_epi8(a81);
//        a83 = (a77 + 1);
//        a84 = *(a83);
//        a85 = _mm256_xor_si256(a82, a84);
//        t13 = _mm256_avg_epu8(a79, a85);
//        a86 = ((__m256i)t13);
//        a87 = _mm256_srli_epi16(a86, 2);
//        a88 = ((__m256i)a87);
//        t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63));
//        t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14);
//        m23 = _mm256_adds_epu8(s18, t14);
//        m24 = _mm256_adds_epu8(s19, t15);
//        m25 = _mm256_adds_epu8(s18, t15);
//        m26 = _mm256_adds_epu8(s19, t14);
//        a89 = _mm256_min_epu8(m24, m23);
//        d9 = _mm256_cmpeq_epi8(a89, m24);
//        a90 = _mm256_min_epu8(m26, m25);
//        d10 = _mm256_cmpeq_epi8(a90, m26);
//        s22 = _mm256_unpacklo_epi8(d9, d10);
//        s23 = _mm256_unpackhi_epi8(d9, d10);
//        s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20));
//        a91 = ((int*)dec);
//        a92 = (4 * i9);
//        a93 = (a91 + a92);
//        *(a93) = s20;
//        s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31));
//        a110 = (a93 + 1);
//        *(a110) = s21;
//        s22 = _mm256_unpacklo_epi8(a89, a90);
//        s23 = _mm256_unpackhi_epi8(a89, a90);
//        a95 = ((__m256i*)Y);
//        s24 = _mm256_permute2x128_si256(s22, s23, 0x20);
//        *(a95) = s24;
//        s23 = _mm256_permute2x128_si256(s22, s23, 0x31);
//        a112 = (a95 + 1);
//        *(a112) = s23;
//        if ((((unsigned char*)Y)[0] > 210)) {
//            __m256i m5, m6;
//            m5 = ((__m256i*)Y)[0];
//            m5 = _mm256_min_epu8(m5, ((__m256i*)Y)[1]);
//            __m256i m7;
//            m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5);
//            m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 32)),
//                                           ((__m256i)m7)));
//            m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 16)),
//                                           ((__m256i)m7)));
//            m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 8)),
//                                           ((__m256i)m7)));
//            m7 = _mm256_unpacklo_epi8(m7, m7);
//            m7 = _mm256_shufflelo_epi16(m7, 0);
//            m6 = _mm256_unpacklo_epi64(m7, m7);
//            m6 = _mm256_permute2x128_si256(
//                m6, m6, 0); // copy lower half of m6 to upper half, since above ops
//                            // operate on 128 bit lanes
//            ((__m256i*)Y)[0] = _mm256_subs_epu8(((__m256i*)Y)[0], m6);
//            ((__m256i*)Y)[1] = _mm256_subs_epu8(((__m256i*)Y)[1], m6);
//        }
//        unsigned char a188, a194;
//        int a205;
//        int s48, s54;
//        unsigned char *a187, *a193;
//        int *a204, *a206, *a223, *b16;
//        __m256i *a184, *a185, *a190, *a196, *a208, *a225;
//        __m256i a199, a200;
//        __m256i a189, a191, a192, a195, a197, a198, a201, a202, a203, d17, d18, m39,
//            m40, m41, m42, s46, s47, s50, s51, t25, t26, t27;
//        a184 = ((__m256i*)Y);
//        s46 = *(a184);
//        a185 = (a184 + 1);
//        s47 = *(a185);
//        s50 = _mm256_permute2x128_si256(s46, s47, 0x20);
//        s47 = _mm256_permute2x128_si256(s46, s47, 0x31);
//        s46 = s50;
//        a187 = (b6 + 2);
//        a188 = *(a187);
//        a189 = _mm256_set1_epi8(a188);
//        a190 = ((__m256i*)Branchtab);
//        a191 = *(a190);
//        a192 = _mm256_xor_si256(a189, a191);
//        a193 = (b6 + 3);
//        a194 = *(a193);
//        a195 = _mm256_set1_epi8(a194);
//        a196 = (a190 + 1);
//        a197 = *(a196);
//        a198 = _mm256_xor_si256(a195, a197);
//        t25 = _mm256_avg_epu8(a192, a198);
//        a199 = ((__m256i)t25);
//        a200 = _mm256_srli_epi16(a199, 2);
//        a201 = ((__m256i)a200);
//        t26 = _mm256_and_si256(a201, _mm256_set1_epi8(63));
//        t27 = _mm256_subs_epu8(_mm256_set1_epi8(63), t26);
//        m39 = _mm256_adds_epu8(s46, t26);
//        m40 = _mm256_adds_epu8(s47, t27);
//        m41 = _mm256_adds_epu8(s46, t27);
//        m42 = _mm256_adds_epu8(s47, t26);
//        a202 = _mm256_min_epu8(m40, m39);
//        d17 = _mm256_cmpeq_epi8(a202, m40);
//        a203 = _mm256_min_epu8(m42, m41);
//        d18 = _mm256_cmpeq_epi8(a203, m42);
//        s24 = _mm256_unpacklo_epi8(d17, d18);
//        s25 = _mm256_unpackhi_epi8(d17, d18);
//        s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
//        a204 = ((int*)dec);
//        a205 = (4 * i9);
//        b16 = (a204 + a205);
//        a206 = (b16 + 2);
//        *(a206) = s48;
//        s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
//        a223 = (b16 + 3);
//        *(a223) = s54;
//        s50 = _mm256_unpacklo_epi8(a202, a203);
//        s51 = _mm256_unpackhi_epi8(a202, a203);
//        s25 = _mm256_permute2x128_si256(s50, s51, 0x20);
//        s51 = _mm256_permute2x128_si256(s50, s51, 0x31);
//        a208 = ((__m256i*)X);
//        *(a208) = s25;
//        a225 = (a208 + 1);
//        *(a225) = s51;
//
//        if ((((unsigned char*)X)[0] > 210)) {
//            __m256i m12, m13;
//            m12 = ((__m256i*)X)[0];
//            m12 = _mm256_min_epu8(m12, ((__m256i*)X)[1]);
//            __m256i m14;
//            m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12);
//            m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 32)),
//                                            ((__m256i)m14)));
//            m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 16)),
//                                            ((__m256i)m14)));
//            m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 8)),
//                                            ((__m256i)m14)));
//            m14 = _mm256_unpacklo_epi8(m14, m14);
//            m14 = _mm256_shufflelo_epi16(m14, 0);
//            m13 = _mm256_unpacklo_epi64(m14, m14);
//            m13 = _mm256_permute2x128_si256(m13, m13, 0);
//            ((__m256i*)X)[0] = _mm256_subs_epu8(((__m256i*)X)[0], m13);
//            ((__m256i*)X)[1] = _mm256_subs_epu8(((__m256i*)X)[1], m13);
//        }
//    }
//
//    renormalize(X, 210);
//
//    unsigned int j;
//    for (j = 0; j < (framebits + excess) % 2; ++j) {
//        int i;
//        for (i = 0; i < 64 / 2; i++) {
//            BFLY(i,
//                 (((framebits + excess) >> 1) << 1) + j,
//                 syms,
//                 Y,
//                 X,
//                 (decision_t*)dec,
//                 Branchtab);
//        }
//
//        renormalize(Y, 210);
//    }
//    /*skip*/
//}
//
//#endif /*LV_HAVE_AVX2*/


#if LV_HAVE_SSE3

#include <emmintrin.h>
#include <mmintrin.h>
#include <pmmintrin.h>
#include <stdio.h>
#include <xmmintrin.h>

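/* SSE3 protokernel (machine-generated, Spiral-style code): each loop
 * iteration performs two trellis stages with 128-bit saturating byte
 * arithmetic, renormalizing whenever the first path metric exceeds 210.
 * Any leftover odd stage is handled with the scalar BFLY() helper below. */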
static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y,
                                                   unsigned char* X,
                                                   unsigned char* syms,
                                                   unsigned char* dec,
                                                   unsigned int framebits,
                                                   unsigned int excess,
                                                   unsigned char* Branchtab)
{
    unsigned int i9;
    for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
        unsigned char a75, a81;
        int a73, a92;
        short int s20, s21, s26, s27;
        unsigned char *a74, *a80, *b6;
        short int *a110, *a111, *a91, *a93, *a94;
        __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98, *a99;
        __m128i a105, a106, a86, a87;
        __m128i a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85,
            a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18,
            s19, s22, s23, s24, s25, s28, s29, t13, t14, t15, t16, t17, t18;
        a71 = ((__m128i*)X);
        s18 = *(a71);
        a72 = (a71 + 2);
        s19 = *(a72);
        a73 = (4 * i9);
        a74 = (syms + a73);
        a75 = *(a74);
        a76 = _mm_set1_epi8(a75);
        a77 = ((__m128i*)Branchtab);
        a78 = *(a77);
        a79 = _mm_xor_si128(a76, a78);
        b6 = (a73 + syms);
        a80 = (b6 + 1);
        a81 = *(a80);
        a82 = _mm_set1_epi8(a81);
        a83 = (a77 + 2);
        a84 = *(a83);
        a85 = _mm_xor_si128(a82, a84);
        t13 = _mm_avg_epu8(a79, a85);
        a86 = ((__m128i)t13);
        a87 = _mm_srli_epi16(a86, 2);
        a88 = ((__m128i)a87);
        t14 = _mm_and_si128(
            a88,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t15 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t14);
        m23 = _mm_adds_epu8(s18, t14);
        m24 = _mm_adds_epu8(s19, t15);
        m25 = _mm_adds_epu8(s18, t15);
        m26 = _mm_adds_epu8(s19, t14);
        a89 = _mm_min_epu8(m24, m23);
        d9 = _mm_cmpeq_epi8(a89, m24);
        a90 = _mm_min_epu8(m26, m25);
        d10 = _mm_cmpeq_epi8(a90, m26);
        s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
        a91 = ((short int*)dec);
        a92 = (8 * i9);
        a93 = (a91 + a92);
        *(a93) = s20;
        s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
        a94 = (a93 + 1);
        *(a94) = s21;
        s22 = _mm_unpacklo_epi8(a89, a90);
        s23 = _mm_unpackhi_epi8(a89, a90);
        a95 = ((__m128i*)Y);
        *(a95) = s22;
        a96 = (a95 + 1);
        *(a96) = s23;
        a97 = (a71 + 1);
        s24 = *(a97);
        a98 = (a71 + 3);
        s25 = *(a98);
        a99 = (a77 + 1);
        a100 = *(a99);
        a101 = _mm_xor_si128(a76, a100);
        a102 = (a77 + 3);
        a103 = *(a102);
        a104 = _mm_xor_si128(a82, a103);
        t16 = _mm_avg_epu8(a101, a104);
        a105 = ((__m128i)t16);
        a106 = _mm_srli_epi16(a105, 2);
        a107 = ((__m128i)a106);
        t17 = _mm_and_si128(
            a107,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t18 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t17);
        m27 = _mm_adds_epu8(s24, t17);
        m28 = _mm_adds_epu8(s25, t18);
        m29 = _mm_adds_epu8(s24, t18);
        m30 = _mm_adds_epu8(s25, t17);
        a108 = _mm_min_epu8(m28, m27);
        d11 = _mm_cmpeq_epi8(a108, m28);
        a109 = _mm_min_epu8(m30, m29);
        d12 = _mm_cmpeq_epi8(a109, m30);
        s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
        a110 = (a93 + 2);
        *(a110) = s26;
        s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
        a111 = (a93 + 3);
        *(a111) = s27;
        s28 = _mm_unpacklo_epi8(a108, a109);
        s29 = _mm_unpackhi_epi8(a108, a109);
        a112 = (a95 + 2);
        *(a112) = s28;
        a113 = (a95 + 3);
        *(a113) = s29;
        if ((((unsigned char*)Y)[0] > 210)) {
            __m128i m5, m6;
            m5 = ((__m128i*)Y)[0];
            m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
            m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
            m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
            __m128i m7;
            m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
            m7 =
                ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7)));
            m7 =
                ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7)));
            m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7)));
            m7 = _mm_unpacklo_epi8(m7, m7);
            m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
            m6 = _mm_unpacklo_epi64(m7, m7);
            ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
            ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
            ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
            ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);
        }
        unsigned char a188, a194;
        int a186, a205;
        short int s48, s49, s54, s55;
        unsigned char *a187, *a193, *b15;
        short int *a204, *a206, *a207, *a223, *a224, *b16;
        __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210, *a211, *a212, *a215,
            *a225, *a226;
        __m128i a199, a200, a218, a219;
        __m128i a189, a191, a192, a195, a197, a198, a201, a202, a203, a213, a214, a216,
            a217, a220, a221, a222, d17, d18, d19, d20, m39, m40, m41, m42, m43, m44, m45,
            m46, s46, s47, s50, s51, s52, s53, s56, s57, t25, t26, t27, t28, t29, t30;
        a184 = ((__m128i*)Y);
        s46 = *(a184);
        a185 = (a184 + 2);
        s47 = *(a185);
        a186 = (4 * i9);
        b15 = (a186 + syms);
        a187 = (b15 + 2);
        a188 = *(a187);
        a189 = _mm_set1_epi8(a188);
        a190 = ((__m128i*)Branchtab);
        a191 = *(a190);
        a192 = _mm_xor_si128(a189, a191);
        a193 = (b15 + 3);
        a194 = *(a193);
        a195 = _mm_set1_epi8(a194);
        a196 = (a190 + 2);
        a197 = *(a196);
        a198 = _mm_xor_si128(a195, a197);
        t25 = _mm_avg_epu8(a192, a198);
        a199 = ((__m128i)t25);
        a200 = _mm_srli_epi16(a199, 2);
        a201 = ((__m128i)a200);
        t26 = _mm_and_si128(
            a201,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t27 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t26);
        m39 = _mm_adds_epu8(s46, t26);
        m40 = _mm_adds_epu8(s47, t27);
        m41 = _mm_adds_epu8(s46, t27);
        m42 = _mm_adds_epu8(s47, t26);
        a202 = _mm_min_epu8(m40, m39);
        d17 = _mm_cmpeq_epi8(a202, m40);
        a203 = _mm_min_epu8(m42, m41);
        d18 = _mm_cmpeq_epi8(a203, m42);
        s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18));
        a204 = ((short int*)dec);
        a205 = (8 * i9);
        b16 = (a204 + a205);
        a206 = (b16 + 4);
        *(a206) = s48;
        s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17, d18));
        a207 = (b16 + 5);
        *(a207) = s49;
        s50 = _mm_unpacklo_epi8(a202, a203);
        s51 = _mm_unpackhi_epi8(a202, a203);
        a208 = ((__m128i*)X);
        *(a208) = s50;
        a209 = (a208 + 1);
        *(a209) = s51;
        a210 = (a184 + 1);
        s52 = *(a210);
        a211 = (a184 + 3);
        s53 = *(a211);
        a212 = (a190 + 1);
        a213 = *(a212);
        a214 = _mm_xor_si128(a189, a213);
        a215 = (a190 + 3);
        a216 = *(a215);
        a217 = _mm_xor_si128(a195, a216);
        t28 = _mm_avg_epu8(a214, a217);
        a218 = ((__m128i)t28);
        a219 = _mm_srli_epi16(a218, 2);
        a220 = ((__m128i)a219);
        t29 = _mm_and_si128(
            a220,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t30 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t29);
        m43 = _mm_adds_epu8(s52, t29);
        m44 = _mm_adds_epu8(s53, t30);
        m45 = _mm_adds_epu8(s52, t30);
        m46 = _mm_adds_epu8(s53, t29);
        a221 = _mm_min_epu8(m44, m43);
        d19 = _mm_cmpeq_epi8(a221, m44);
        a222 = _mm_min_epu8(m46, m45);
        d20 = _mm_cmpeq_epi8(a222, m46);
        s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19, d20));
        a223 = (b16 + 6);
        *(a223) = s54;
        s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19, d20));
        a224 = (b16 + 7);
        *(a224) = s55;
        s56 = _mm_unpacklo_epi8(a221, a222);
        s57 = _mm_unpackhi_epi8(a221, a222);
        a225 = (a208 + 2);
        *(a225) = s56;
        a226 = (a208 + 3);
        *(a226) = s57;
        if ((((unsigned char*)X)[0] > 210)) {
            __m128i m12, m13;
            m12 = ((__m128i*)X)[0];
            m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]);
            m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]);
            m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]);
            __m128i m14;
            m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
            m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)),
                                         ((__m128i)m14)));
            m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)),
                                         ((__m128i)m14)));
            m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)),
                                         ((__m128i)m14)));
            m14 = _mm_unpacklo_epi8(m14, m14);
            m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
            m13 = _mm_unpacklo_epi64(m14, m14);
            ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13);
            ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13);
            ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13);
            ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13);
        }
    }

    renormalize(X, 210);

    /*int ch;
    for(ch = 0; ch < 64; ch++) {
        printf("%d,", X[ch]);
    }
    printf("\n");*/

    unsigned int j;
    for (j = 0; j < (framebits + excess) % 2; ++j) {
        int i;
        for (i = 0; i < 64 / 2; i++) {
            BFLY(i,
                 (((framebits + excess) >> 1) << 1) + j,
                 syms,
                 Y,
                 X,
                 (decision_t*)dec,
                 Branchtab);
        }


        renormalize(Y, 210);

        /*printf("\n");
        for(ch = 0; ch < 64; ch++) {
            printf("%d,", Y[ch]);
        }
        printf("\n");*/
    }
    /*skip*/
}

#endif /*LV_HAVE_SSE3*/


#if LV_HAVE_GENERIC

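/* Portable reference implementation: one trellis stage per iteration
 * using the scalar BFLY() butterflies, renormalizing after every stage.
 * X and Y are swapped at the end of each stage so that X always holds
 * the metrics entering the next stage. */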
static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y,
                                                    unsigned char* X,
                                                    unsigned char* syms,
                                                    unsigned char* dec,
                                                    unsigned int framebits,
                                                    unsigned int excess,
                                                    unsigned char* Branchtab)
{
    int nbits = framebits + excess;
    int NUMSTATES = 64;
    int RENORMALIZE_THRESHOLD = 210;

    int s, i;
    for (s = 0; s < nbits; s++) {
        void* tmp;
        for (i = 0; i < NUMSTATES / 2; i++) {
            BFLY(i, s, syms, Y, X, (decision_t*)dec, Branchtab);
        }

        renormalize(Y, RENORMALIZE_THRESHOLD);

        tmp = (void*)X;
        X = Y;
        Y = (unsigned char*)tmp;
    }
}

#endif /* LV_HAVE_GENERIC */

#endif /*INCLUDED_volk_8u_x4_conv_k7_r2_8u_H*/