libosmocore 1.9.0.196-9975
Osmocom core library
conv_acc_neon_impl.h
/*
 * (C) 2020 by sysmocom - s.f.m.c. GmbH
 * Author: Eric Wild
 *
 * All Rights Reserved
 *
 * SPDX-License-Identifier: GPL-2.0+
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */

/* Some distributions (notably Alpine Linux) for some strange reason
 * don't have this #define */
#ifndef __always_inline
#define __always_inline inline __attribute__((always_inline))
#endif

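/* Viterbi add-compare-select (ACS) butterfly: M0 and M1 hold the accumulated
 * path metrics of the even/odd-indexed predecessor states, M2 holds the
 * branch metrics. The four candidate metrics are formed with saturating
 * add/sub, the larger of each pair is kept as the new path metric (returned
 * in M2 and M4), and the comparison masks (returned in M3 and M1) record
 * which branch survived, for use during traceback. */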
#define NEON_BUTTERFLY(M0,M1,M2,M3,M4) \
{ \
        M3 = vqaddq_s16(M0, M2); \
        M4 = vqsubq_s16(M1, M2); \
        M0 = vqsubq_s16(M0, M2); \
        M1 = vqaddq_s16(M1, M2); \
        M2 = vmaxq_s16(M3, M4); \
        M3 = vreinterpretq_s16_u16(vcgtq_s16(M3, M4)); \
        M4 = vmaxq_s16(M0, M1); \
        M1 = vreinterpretq_s16_u16(vcgtq_s16(M0, M1)); \
}

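/* Deinterleave two registers of packed path metrics into even-indexed (M2)
 * and odd-indexed (M3) states (K=5 trellis: 16 states in two registers). */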
#define NEON_DEINTERLEAVE_K5(M0,M1,M2,M3) \
{ \
        int16x8x2_t tmp; \
        tmp = vuzpq_s16(M0, M1); \
        M2 = tmp.val[0]; \
        M3 = tmp.val[1]; \
}

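/* Same deinterleaving for the K=7 trellis (64 states in eight registers):
 * even-indexed metrics end up in M8/M10/M12/M14, odd-indexed ones in
 * M9/M11/M13/M15. */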
#define NEON_DEINTERLEAVE_K7(M0,M1,M2,M3,M4,M5,M6,M7,M8,M9,M10,M11,M12,M13,M14,M15) \
{ \
        int16x8x2_t tmp; \
        tmp = vuzpq_s16(M0, M1); \
        M8 = tmp.val[0]; M9 = tmp.val[1]; \
        tmp = vuzpq_s16(M2, M3); \
        M10 = tmp.val[0]; M11 = tmp.val[1]; \
        tmp = vuzpq_s16(M4, M5); \
        M12 = tmp.val[0]; M13 = tmp.val[1]; \
        tmp = vuzpq_s16(M6, M7); \
        M14 = tmp.val[0]; M15 = tmp.val[1]; \
}

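/* Branch metrics for rate 1/2 codes: multiply the expected output symbols
 * (M0..M3) by the received symbols (M4) and sum adjacent products with
 * pairwise adds, producing 16 branch metrics in M6 and M7. */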
#define NEON_BRANCH_METRIC_N2(M0,M1,M2,M3,M4,M6,M7) \
{ \
        M0 = vmulq_s16(M4, M0); \
        M1 = vmulq_s16(M4, M1); \
        M2 = vmulq_s16(M4, M2); \
        M3 = vmulq_s16(M4, M3); \
        M6 = vcombine_s16(vpadd_s16(vget_low_s16(M0), vget_high_s16(M0)), vpadd_s16(vget_low_s16(M1), vget_high_s16(M1))); \
        M7 = vcombine_s16(vpadd_s16(vget_low_s16(M2), vget_high_s16(M2)), vpadd_s16(vget_low_s16(M3), vget_high_s16(M3))); \
}

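/* Branch metrics for rate 1/4 codes: as above, but two rounds of pairwise
 * adds reduce four products per branch, producing 8 branch metrics in M5. */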
#define NEON_BRANCH_METRIC_N4(M0,M1,M2,M3,M4,M5) \
{ \
        M0 = vmulq_s16(M4, M0); \
        M1 = vmulq_s16(M4, M1); \
        M2 = vmulq_s16(M4, M2); \
        M3 = vmulq_s16(M4, M3); \
        int16x4_t t1 = vpadd_s16(vpadd_s16(vget_low_s16(M0), vget_high_s16(M0)), vpadd_s16(vget_low_s16(M1), vget_high_s16(M1))); \
        int16x4_t t2 = vpadd_s16(vpadd_s16(vget_low_s16(M2), vget_high_s16(M2)), vpadd_s16(vget_low_s16(M3), vget_high_s16(M3))); \
        M5 = vcombine_s16(t1, t2); \
}

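/* Path metric normalization for K=5: find the minimum metric across both
 * registers with a pairwise-min reduction, broadcast it and subtract it
 * from all metrics so the saturating arithmetic does not overflow.
 * (M3 is unused.) */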
#define NEON_NORMALIZE_K5(M0,M1,M2,M3) \
{ \
        M2 = vminq_s16(M0, M1); \
        int16x4_t t = vpmin_s16(vget_low_s16(M2), vget_high_s16(M2)); \
        t = vpmin_s16(t, t); \
        t = vpmin_s16(t, t); \
        M2 = vdupq_lane_s16(t, 0); \
        M0 = vqsubq_s16(M0, M2); \
        M1 = vqsubq_s16(M1, M2); \
}

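/* Same normalization for the K=7 trellis: reduce the minimum over all eight
 * metric registers, then subtract it from each of them. */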
#define NEON_NORMALIZE_K7(M0,M1,M2,M3,M4,M5,M6,M7,M8,M9,M10,M11) \
{ \
        M8 = vminq_s16(M0, M1); \
        M9 = vminq_s16(M2, M3); \
        M10 = vminq_s16(M4, M5); \
        M11 = vminq_s16(M6, M7); \
        M8 = vminq_s16(M8, M9); \
        M10 = vminq_s16(M10, M11); \
        M8 = vminq_s16(M8, M10); \
        int16x4_t t = vpmin_s16(vget_low_s16(M8), vget_high_s16(M8)); \
        t = vpmin_s16(t, t); \
        t = vpmin_s16(t, t); \
        M8 = vdupq_lane_s16(t, 0); \
        M0 = vqsubq_s16(M0, M8); \
        M1 = vqsubq_s16(M1, M8); \
        M2 = vqsubq_s16(M2, M8); \
        M3 = vqsubq_s16(M3, M8); \
        M4 = vqsubq_s16(M4, M8); \
        M5 = vqsubq_s16(M5, M8); \
        M6 = vqsubq_s16(M6, M8); \
        M7 = vqsubq_s16(M7, M8); \
}

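/* For illustration only (not used by the code below; the name is ours, not
 * from libosmocore): the horizontal-minimum reduction performed inside the
 * normalization macros above is equivalent to this small helper, which
 * returns the smallest lane of a 16-bit vector. */
static __always_inline int16_t _neon_hmin_s16_example(int16x8_t v)
{
        int16x4_t t = vpmin_s16(vget_low_s16(v), vget_high_s16(v));
        t = vpmin_s16(t, t);
        t = vpmin_s16(t, t);
        return vget_lane_s16(t, 0);
}

/* One trellis interval for constraint length K=5 (16 states) at rate 1/2:
 * compute the branch metrics from the received symbols in 'val' and the
 * expected-output table 'outa', run the ACS butterflies, store the 16
 * surviving-branch decisions in 'paths' and write the updated path metrics
 * back to 'sumsa' (normalized first when 'norm' is set). */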
__always_inline void _neon_metrics_k5_n2(const int16_t *val, const int16_t *outa, int16_t *sumsa, int16_t *paths,
                                         int norm)
{
        int16_t *__restrict out = __builtin_assume_aligned(outa, 8);
        int16_t *__restrict sums = __builtin_assume_aligned(sumsa, 8);
        int16x8_t m0, m1, m2, m3, m4, m5, m6;
        int16x4_t input;

        /* (BMU) Load the input symbols (already expanded to 16 bits) */
        input = vld1_s16(val);
        m2 = vcombine_s16(input, input);

        /* (BMU) Load and compute branch metrics */
        m0 = vld1q_s16(&out[0]);
        m1 = vld1q_s16(&out[8]);

        m0 = vmulq_s16(m2, m0);
        m1 = vmulq_s16(m2, m1);
        m2 = vcombine_s16(vpadd_s16(vget_low_s16(m0), vget_high_s16(m0)),
                          vpadd_s16(vget_low_s16(m1), vget_high_s16(m1)));

        /* (PMU) Load accumulated path metrics */
        m0 = vld1q_s16(&sums[0]);
        m1 = vld1q_s16(&sums[8]);

        NEON_DEINTERLEAVE_K5(m0, m1, m3, m4)

        /* (PMU) Butterflies: 0-7 */
        NEON_BUTTERFLY(m3, m4, m2, m5, m6)

        if (norm)
                NEON_NORMALIZE_K5(m2, m6, m0, m1)

        vst1q_s16(&sums[0], m2);
        vst1q_s16(&sums[8], m6);
        vst1q_s16(&paths[0], m5);
        vst1q_s16(&paths[8], m4);
}

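/* Same as _neon_metrics_k5_n2, but for rate 1/4 codes: the branch metrics
 * are built from a 32-entry expected-output block and reduced with
 * NEON_BRANCH_METRIC_N4. */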
__always_inline void _neon_metrics_k5_n4(const int16_t *val, const int16_t *outa, int16_t *sumsa, int16_t *paths,
                                         int norm)
{
        int16_t *__restrict out = __builtin_assume_aligned(outa, 8);
        int16_t *__restrict sums = __builtin_assume_aligned(sumsa, 8);
        int16x8_t m0, m1, m2, m3, m4, m5, m6;
        int16x4_t input;

        /* (BMU) Load the input symbols (already expanded to 16 bits) */
        input = vld1_s16(val);
        m4 = vcombine_s16(input, input);

        /* (BMU) Load and compute branch metrics */
        m0 = vld1q_s16(&out[0]);
        m1 = vld1q_s16(&out[8]);
        m2 = vld1q_s16(&out[16]);
        m3 = vld1q_s16(&out[24]);

        NEON_BRANCH_METRIC_N4(m0, m1, m2, m3, m4, m2)

        /* (PMU) Load accumulated path metrics */
        m0 = vld1q_s16(&sums[0]);
        m1 = vld1q_s16(&sums[8]);

        NEON_DEINTERLEAVE_K5(m0, m1, m3, m4)

        /* (PMU) Butterflies: 0-7 */
        NEON_BUTTERFLY(m3, m4, m2, m5, m6)

        if (norm)
                NEON_NORMALIZE_K5(m2, m6, m0, m1)

        vst1q_s16(&sums[0], m2);
        vst1q_s16(&sums[8], m6);
        vst1q_s16(&paths[0], m5);
        vst1q_s16(&paths[8], m4);
}

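/* One trellis interval for constraint length K=7 (64 states) at rate 1/2:
 * the 64 accumulated path metrics are kept in eight NEON registers and 64
 * decisions are written to 'paths' per call; otherwise the structure
 * mirrors the K=5 variants above. */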
__always_inline static void _neon_metrics_k7_n2(const int16_t *val, const int16_t *outa, int16_t *sumsa, int16_t *paths,
                                                int norm)
{
        int16_t *__restrict out = __builtin_assume_aligned(outa, 8);
        int16_t *__restrict sums = __builtin_assume_aligned(sumsa, 8);
        int16x8_t m0, m1, m2, m3, m4, m5, m6, m7;
        int16x8_t m8, m9, m10, m11, m12, m13, m14, m15;
        int16x4_t input;

        /* (PMU) Load accumulated path metrics */
        m0 = vld1q_s16(&sums[0]);
        m1 = vld1q_s16(&sums[8]);
        m2 = vld1q_s16(&sums[16]);
        m3 = vld1q_s16(&sums[24]);
        m4 = vld1q_s16(&sums[32]);
        m5 = vld1q_s16(&sums[40]);
        m6 = vld1q_s16(&sums[48]);
        m7 = vld1q_s16(&sums[56]);

        /* (PMU) Deinterleave into even and odd packed registers */
        NEON_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15)

        /* (BMU) Load the input symbols (already expanded to 16 bits) */
        input = vld1_s16(val);
        m7 = vcombine_s16(input, input);

        /* (BMU) Load and compute branch metrics */
        m0 = vld1q_s16(&out[0]);
        m1 = vld1q_s16(&out[8]);
        m2 = vld1q_s16(&out[16]);
        m3 = vld1q_s16(&out[24]);

        NEON_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m4, m5)

        m0 = vld1q_s16(&out[32]);
        m1 = vld1q_s16(&out[40]);
        m2 = vld1q_s16(&out[48]);
        m3 = vld1q_s16(&out[56]);

        NEON_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m6, m7)

        /* (PMU) Butterflies: 0-15 */
        NEON_BUTTERFLY(m8, m9, m4, m0, m1)
        NEON_BUTTERFLY(m10, m11, m5, m2, m3)

        vst1q_s16(&paths[0], m0);
        vst1q_s16(&paths[8], m2);
        vst1q_s16(&paths[32], m9);
        vst1q_s16(&paths[40], m11);

        /* (PMU) Butterflies: 16-31 */
        NEON_BUTTERFLY(m12, m13, m6, m0, m2)
        NEON_BUTTERFLY(m14, m15, m7, m9, m11)

        vst1q_s16(&paths[16], m0);
        vst1q_s16(&paths[24], m9);
        vst1q_s16(&paths[48], m13);
        vst1q_s16(&paths[56], m15);

        if (norm)
                NEON_NORMALIZE_K7(m4, m1, m5, m3, m6, m2, m7, m11, m0, m8, m9, m10)

        vst1q_s16(&sums[0], m4);
        vst1q_s16(&sums[8], m5);
        vst1q_s16(&sums[16], m6);
        vst1q_s16(&sums[24], m7);
        vst1q_s16(&sums[32], m1);
        vst1q_s16(&sums[40], m3);
        vst1q_s16(&sums[48], m2);
        vst1q_s16(&sums[56], m11);
}

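/* K=7, rate 1/4 variant: identical state handling to the rate 1/2 version,
 * with the branch metrics built from a 128-entry expected-output block. */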
__always_inline static void _neon_metrics_k7_n4(const int16_t *val, const int16_t *outa, int16_t *sumsa, int16_t *paths,
                                                int norm)
{
        int16_t *__restrict out = __builtin_assume_aligned(outa, 8);
        int16_t *__restrict sums = __builtin_assume_aligned(sumsa, 8);
        int16x8_t m0, m1, m2, m3, m4, m5, m6, m7;
        int16x8_t m8, m9, m10, m11, m12, m13, m14, m15;
        int16x4_t input;

        /* (PMU) Load accumulated path metrics */
        m0 = vld1q_s16(&sums[0]);
        m1 = vld1q_s16(&sums[8]);
        m2 = vld1q_s16(&sums[16]);
        m3 = vld1q_s16(&sums[24]);
        m4 = vld1q_s16(&sums[32]);
        m5 = vld1q_s16(&sums[40]);
        m6 = vld1q_s16(&sums[48]);
        m7 = vld1q_s16(&sums[56]);

        /* (PMU) Deinterleave into even and odd packed registers */
        NEON_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15)

        /* (BMU) Load the input symbols (already expanded to 16 bits) */
        input = vld1_s16(val);
        m7 = vcombine_s16(input, input);

        /* (BMU) Load and compute branch metrics */
        m0 = vld1q_s16(&out[0]);
        m1 = vld1q_s16(&out[8]);
        m2 = vld1q_s16(&out[16]);
        m3 = vld1q_s16(&out[24]);

        NEON_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m4)

        m0 = vld1q_s16(&out[32]);
        m1 = vld1q_s16(&out[40]);
        m2 = vld1q_s16(&out[48]);
        m3 = vld1q_s16(&out[56]);

        NEON_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m5)

        m0 = vld1q_s16(&out[64]);
        m1 = vld1q_s16(&out[72]);
        m2 = vld1q_s16(&out[80]);
        m3 = vld1q_s16(&out[88]);

        NEON_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m6)

        m0 = vld1q_s16(&out[96]);
        m1 = vld1q_s16(&out[104]);
        m2 = vld1q_s16(&out[112]);
        m3 = vld1q_s16(&out[120]);

        NEON_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m7)

        /* (PMU) Butterflies: 0-15 */
        NEON_BUTTERFLY(m8, m9, m4, m0, m1)
        NEON_BUTTERFLY(m10, m11, m5, m2, m3)

        vst1q_s16(&paths[0], m0);
        vst1q_s16(&paths[8], m2);
        vst1q_s16(&paths[32], m9);
        vst1q_s16(&paths[40], m11);

        /* (PMU) Butterflies: 16-31 */
        NEON_BUTTERFLY(m12, m13, m6, m0, m2)
        NEON_BUTTERFLY(m14, m15, m7, m9, m11)

        vst1q_s16(&paths[16], m0);
        vst1q_s16(&paths[24], m9);
        vst1q_s16(&paths[48], m13);
        vst1q_s16(&paths[56], m15);

        if (norm)
                NEON_NORMALIZE_K7(m4, m1, m5, m3, m6, m2, m7, m11, m0, m8, m9, m10)

        vst1q_s16(&sums[0], m4);
        vst1q_s16(&sums[8], m5);
        vst1q_s16(&sums[16], m6);
        vst1q_s16(&sums[24], m7);
        vst1q_s16(&sums[32], m1);
        vst1q_s16(&sums[40], m3);
        vst1q_s16(&sums[48], m2);
        vst1q_s16(&sums[56], m11);
}