FFmpeg coverage


Directory: ../../../ffmpeg/
File: src/libavcodec/x86/vvc/vvcdsp_init.c
Date: 2024-07-17 14:05:47
Exec Total Coverage
Lines: 47 49 95.9%
Functions: 225 225 100.0%
Branches: 18 22 81.8%

Line Branch Exec Source
1 /*
2 * VVC DSP init for x86
3 *
4 * Copyright (C) 2022-2024 Nuo Mi
5 * Copyright (c) 2023-2024 Wu Jianhua
6 *
7 * This file is part of FFmpeg.
8 *
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24 #include "config.h"
25
26 #include "libavutil/cpu.h"
27 #include "libavutil/x86/cpu.h"
28 #include "libavcodec/vvc/dec.h"
29 #include "libavcodec/vvc/ctu.h"
30 #include "libavcodec/vvc/dsp.h"
31 #include "libavcodec/x86/h26x/h2656dsp.h"
32
/* Declares the prototype of a single ff_vvc_put_<name>_<depth>_<opt> function.
 * All such functions share one signature: (dst, src, srcstride, height, hf, vf, width). */
33 #define PUT_PROTOTYPE(name, depth, opt) \
34 void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride, int height, const int8_t *hf, const int8_t *vf, int width);
35
/* Declares PUT_PROTOTYPE for every numeric suffix appended to `name`:
 * 2, 4, 8, 12, 16, 24, 32, 48, 64 and 128 (presumably the block width -- confirm
 * against the assembly sources). */
36 #define PUT_PROTOTYPES(name, bitd, opt) \
37 PUT_PROTOTYPE(name##2, bitd, opt) \
38 PUT_PROTOTYPE(name##4, bitd, opt) \
39 PUT_PROTOTYPE(name##8, bitd, opt) \
40 PUT_PROTOTYPE(name##12, bitd, opt) \
41 PUT_PROTOTYPE(name##16, bitd, opt) \
42 PUT_PROTOTYPE(name##24, bitd, opt) \
43 PUT_PROTOTYPE(name##32, bitd, opt) \
44 PUT_PROTOTYPE(name##48, bitd, opt) \
45 PUT_PROTOTYPE(name##64, bitd, opt) \
46 PUT_PROTOTYPE(name##128, bitd, opt)
47
/* Repeats PUT_PROTOTYPES for the three depth suffixes used in this file: 8, 10 and 12. */
48 #define PUT_BPC_PROTOTYPES(name, opt) \
49 PUT_PROTOTYPES(name, 8, opt) \
50 PUT_PROTOTYPES(name, 10, opt) \
51 PUT_PROTOTYPES(name, 12, opt)
52
/* Declares the <n>tap_h, <n>tap_v and <n>tap_hv variants for all depths of one
 * instruction-set suffix `opt`. */
53 #define PUT_TAP_PROTOTYPES(n, opt) \
54 PUT_BPC_PROTOTYPES(n##tap_h, opt) \
55 PUT_BPC_PROTOTYPES(n##tap_v, opt) \
56 PUT_BPC_PROTOTYPES(n##tap_hv, opt)
57
/* Prototypes for the plain "pixels" variants, sse4 and avx2. */
58 PUT_BPC_PROTOTYPES(pixels, sse4)
59 PUT_BPC_PROTOTYPES(pixels, avx2)
60
/* Prototypes for the 4-tap and 8-tap variants, sse4 and avx2. */
61 PUT_TAP_PROTOTYPES(4, sse4)
62 PUT_TAP_PROTOTYPES(8, sse4)
63 PUT_TAP_PROTOTYPES(4, avx2)
64 PUT_TAP_PROTOTYPES(8, avx2)
65
/* Name-building helpers:
 *   bf(fn, bd, opt)  -> fn_<bd>_<opt>       (name keyed by the bd argument)
 *   BF(fn, bpc, opt) -> fn_<bpc>bpc_<opt>   (name keyed by the bpc argument) */
66 #define bf(fn, bd, opt) fn##_##bd##_##opt
67 #define BF(fn, bpc, opt) fn##_##bpc##bpc_##opt
68
/* Declares ff_vvc_avg_<bpc>bpc_<opt> and ff_vvc_w_avg_<bpc>bpc_<opt>.
 * These take intptr_t scalar arguments and an explicit trailing pixel_max; they
 * are not defined in this file (presumably implemented in assembly -- confirm). */
69 #define AVG_BPC_PROTOTYPES(bpc, opt) \
70 void BF(ff_vvc_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
71 const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, intptr_t pixel_max); \
72 void BF(ff_vvc_w_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
73 const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, \
74 intptr_t denom, intptr_t w0, intptr_t w1, intptr_t o0, intptr_t o1, intptr_t pixel_max);
75
/* Declares the per-bit-depth wrappers ff_vvc_avg_<bd>_<opt> and
 * ff_vvc_w_avg_<bd>_<opt>, which take plain int arguments and no pixel_max.
 * Their bodies are generated further down by AVG_FUNCS. */
76 #define AVG_PROTOTYPES(bd, opt) \
77 void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
78 const int16_t *src0, const int16_t *src1, int width, int height); \
79 void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
80 const int16_t *src0, const int16_t *src1, int width, int height, \
81 int denom, int w0, int w1, int o0, int o1);
82
/* Two "bpc" kernels (8 and 16) ... */
83 AVG_BPC_PROTOTYPES( 8, avx2)
84 AVG_BPC_PROTOTYPES(16, avx2)
85
/* ... back three bit-depth wrappers (8, 10, 12). */
86 AVG_PROTOTYPES( 8, avx2)
87 AVG_PROTOTYPES(10, avx2)
88 AVG_PROTOTYPES(12, avx2)
89
/* Declares the four "bpc" ALF entry points for one instruction set:
 *   ff_vvc_alf_filter_luma_<bpc>bpc_<opt>, ff_vvc_alf_filter_chroma_<bpc>bpc_<opt>,
 *   ff_vvc_alf_classify_grad_<bpc>bpc_<opt> and ff_vvc_alf_classify_<bpc>bpc_<opt>.
 * The filter functions take an extra stride and pixel_max compared with the
 * per-bit-depth wrappers. Not defined in this file (presumably assembly -- confirm).
 * NOTE: the macro body ends with a backslash, so the following empty line is
 * still part of the definition. */
90 #define ALF_BPC_PROTOTYPES(bpc, opt) \
91 void BF(ff_vvc_alf_filter_luma, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
92 const uint8_t *src, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, \
93 const int16_t *filter, const int16_t *clip, ptrdiff_t stride, ptrdiff_t vb_pos, ptrdiff_t pixel_max); \
94 void BF(ff_vvc_alf_filter_chroma, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
95 const uint8_t *src, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, \
96 const int16_t *filter, const int16_t *clip, ptrdiff_t stride, ptrdiff_t vb_pos, ptrdiff_t pixel_max); \
97 void BF(ff_vvc_alf_classify_grad, bpc, opt)(int *gradient_sum, \
98 const uint8_t *src, ptrdiff_t src_stride, intptr_t width, intptr_t height, intptr_t vb_pos); \
99 void BF(ff_vvc_alf_classify, bpc, opt)(int *class_idx, int *transpose_idx, const int *gradient_sum, \
100 intptr_t width, intptr_t height, intptr_t vb_pos, intptr_t bit_depth); \
101
/* Declares the per-bit-depth ALF wrappers ff_vvc_alf_filter_luma_<bd>_<opt>,
 * ff_vvc_alf_filter_chroma_<bd>_<opt> and ff_vvc_alf_classify_<bd>_<opt>.
 * The `bpc` parameter is accepted but not used in the expansion; only `bd` and
 * `opt` appear in the generated names. The bodies are generated by ALF_FUNCS.
 * NOTE: the macro body ends with a backslash, so the following empty line is
 * still part of the definition. */
102 #define ALF_PROTOTYPES(bpc, bd, opt) \
103 void bf(ff_vvc_alf_filter_luma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, \
104 int width, int height, const int16_t *filter, const int16_t *clip, const int vb_pos); \
105 void bf(ff_vvc_alf_filter_chroma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, \
106 int width, int height, const int16_t *filter, const int16_t *clip, const int vb_pos); \
107 void bf(ff_vvc_alf_classify, bd, opt)(int *class_idx, int *transpose_idx, \
108 const uint8_t *src, ptrdiff_t src_stride, int width, int height, int vb_pos, int *gradient_tmp); \
109
/* "bpc" kernels: 8 and 16. */
110 ALF_BPC_PROTOTYPES(8, avx2)
111 ALF_BPC_PROTOTYPES(16, avx2)
112
/* Bit-depth wrappers: 8 pairs with the 8bpc kernels, 10 and 12 with the 16bpc kernels. */
113 ALF_PROTOTYPES(8, 8, avx2)
114 ALF_PROTOTYPES(16, 10, avx2)
115 ALF_PROTOTYPES(16, 12, avx2)
116
117 #if ARCH_X86_64
118 #if HAVE_SSE4_EXTERNAL
/* Defines ff_vvc_put_<name>_<depth>_<opt> as a thin forwarder to
 * ff_h2656_put_<name>_<depth>_<opt>. The only difference between the two
 * signatures is the extra second argument, which is always 2 * MAX_PB_SIZE
 * (presumably the destination stride in bytes for the int16_t buffer -- confirm
 * against h2656dsp.h). */
119 #define FW_PUT(name, depth, opt) \
120 void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride, \
121 int height, const int8_t *hf, const int8_t *vf, int width) \
122 { \
123 ff_h2656_put_## name ## _ ## depth ## _##opt(dst, 2 * MAX_PB_SIZE, src, srcstride, height, hf, vf, width); \
124 }
125
/* Generates FW_PUT forwarders for the size suffixes 4, 8, 16, 32, 64 and 128.
 * NOTE: the body ends with a backslash, so the following empty line is part of
 * the definition. */
126 #define FW_PUT_TAP(fname, bitd, opt ) \
127 FW_PUT(fname##4, bitd, opt ) \
128 FW_PUT(fname##8, bitd, opt ) \
129 FW_PUT(fname##16, bitd, opt ) \
130 FW_PUT(fname##32, bitd, opt ) \
131 FW_PUT(fname##64, bitd, opt ) \
132 FW_PUT(fname##128, bitd, opt ) \
133
/* Same as FW_PUT_TAP, plus the size-2 variant. */
134 #define FW_PUT_4TAP(fname, bitd, opt) \
135 FW_PUT(fname ## 2, bitd, opt) \
136 FW_PUT_TAP(fname, bitd, opt)
137
/* SSE4 forwarders that include size 2: pixels and the 4-tap h/v/hv filters. */
138 #define FW_PUT_4TAP_SSE4(bitd) \
139 FW_PUT_4TAP(pixels, bitd, sse4) \
140 FW_PUT_4TAP(4tap_h, bitd, sse4) \
141 FW_PUT_4TAP(4tap_v, bitd, sse4) \
142 FW_PUT_4TAP(4tap_hv, bitd, sse4)
143
/* SSE4 forwarders for the 8-tap h/v/hv filters (sizes 4..128, no size 2). */
144 #define FW_PUT_8TAP_SSE4(bitd) \
145 FW_PUT_TAP(8tap_h, bitd, sse4) \
146 FW_PUT_TAP(8tap_v, bitd, sse4) \
147 FW_PUT_TAP(8tap_hv, bitd, sse4)
148
/* All SSE4 forwarders for one depth. */
149 #define FW_PUT_SSE4(bitd) \
150 FW_PUT_4TAP_SSE4(bitd) \
151 FW_PUT_8TAP_SSE4(bitd)
152
/* Instantiate the SSE4 forwarders for depths 8, 10 and 12. */
153 914 FW_PUT_SSE4( 8)
154 1096 FW_PUT_SSE4(10)
155 1096 FW_PUT_SSE4(12)
156 #endif
157
158 #if HAVE_AVX2_EXTERNAL
/* AVX2 forwarders for the <n>tap_h and <n>tap_v filters, sizes 32, 64 and 128 only. */
159 #define FW_PUT_TAP_AVX2(n, bitd) \
160 FW_PUT(n ## tap_h32, bitd, avx2) \
161 FW_PUT(n ## tap_h64, bitd, avx2) \
162 FW_PUT(n ## tap_h128, bitd, avx2) \
163 FW_PUT(n ## tap_v32, bitd, avx2) \
164 FW_PUT(n ## tap_v64, bitd, avx2) \
165 FW_PUT(n ## tap_v128, bitd, avx2)
166
/* All AVX2 forwarders shared by every depth: pixels32/64/128 plus the 4-tap and
 * 8-tap h/v filters above.
 * NOTE: the body ends with a backslash, so the following empty line is part of
 * the definition. */
167 #define FW_PUT_AVX2(bitd) \
168 FW_PUT(pixels32, bitd, avx2) \
169 FW_PUT(pixels64, bitd, avx2) \
170 FW_PUT(pixels128, bitd, avx2) \
171 FW_PUT_TAP_AVX2(4, bitd) \
172 FW_PUT_TAP_AVX2(8, bitd) \
173
/* Instantiate for depths 8, 10 and 12. */
174 234 FW_PUT_AVX2( 8)
175 234 FW_PUT_AVX2(10)
176 234 FW_PUT_AVX2(12)
177
/* Additional AVX2 forwarders that this file instantiates only for depths 10 and 12:
 * size-16 h and v filters, and the hv filters for sizes 16, 32, 64 and 128. */
178 #define FW_PUT_TAP_16BPC_AVX2(n, bitd) \
179 FW_PUT(n ## tap_h16, bitd, avx2) \
180 FW_PUT(n ## tap_v16, bitd, avx2) \
181 FW_PUT(n ## tap_hv16, bitd, avx2) \
182 FW_PUT(n ## tap_hv32, bitd, avx2) \
183 FW_PUT(n ## tap_hv64, bitd, avx2) \
184 FW_PUT(n ## tap_hv128, bitd, avx2)
185
/* pixels16 plus the 4-tap and 8-tap sets above. */
186 #define FW_PUT_16BPC_AVX2(bitd) \
187 FW_PUT(pixels16, bitd, avx2) \
188 FW_PUT_TAP_16BPC_AVX2(4, bitd) \
189 FW_PUT_TAP_16BPC_AVX2(8, bitd)
190
/* No depth-8 instantiation here. */
191 182 FW_PUT_16BPC_AVX2(10)
192 182 FW_PUT_16BPC_AVX2(12)
193
/* Defines the per-bit-depth wrappers ff_vvc_avg_<bd>_<opt> and
 * ff_vvc_w_avg_<bd>_<opt>. Each forwards its arguments unchanged to the matching
 * <bpc>bpc kernel and appends (1 << bd) - 1 as the pixel_max argument. */
194 #define AVG_FUNCS(bpc, bd, opt) \
195 void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
196 const int16_t *src0, const int16_t *src1, int width, int height) \
197 { \
198 BF(ff_vvc_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, (1 << bd) - 1); \
199 } \
200 void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
201 const int16_t *src0, const int16_t *src1, int width, int height, \
202 int denom, int w0, int w1, int o0, int o1) \
203 { \
204 BF(ff_vvc_w_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, \
205 denom, w0, w1, o0, o1, (1 << bd) - 1); \
206 }
207
/* Depth 8 uses the 8bpc kernels; depths 10 and 12 share the 16bpc kernels. */
208 196 AVG_FUNCS(8, 8, avx2)
209 196 AVG_FUNCS(16, 10, avx2)
210 196 AVG_FUNCS(16, 12, avx2)
211
/* Defines the per-bit-depth ALF wrappers on top of the <bpc>bpc kernels:
 *  - filter_luma:   passes param_stride = (width >> 2) * ALF_NUM_COEFF_LUMA as the
 *                   kernel's `stride` argument and (1 << bd) - 1 as pixel_max.
 *  - filter_chroma: passes 0 as `stride` and (1 << bd) - 1 as pixel_max.
 *  - classify:      first calls the classify_grad kernel, which receives
 *                   gradient_tmp as its gradient_sum buffer, then calls the classify
 *                   kernel with that same buffer and `bd` as the bit_depth argument.
 * NOTE: the body ends with a backslash, so the following empty line is part of
 * the definition. */
212 #define ALF_FUNCS(bpc, bd, opt) \
213 void bf(ff_vvc_alf_filter_luma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, \
214 int width, int height, const int16_t *filter, const int16_t *clip, const int vb_pos) \
215 { \
216 const int param_stride = (width >> 2) * ALF_NUM_COEFF_LUMA; \
217 BF(ff_vvc_alf_filter_luma, bpc, opt)(dst, dst_stride, src, src_stride, width, height, \
218 filter, clip, param_stride, vb_pos, (1 << bd) - 1); \
219 } \
220 void bf(ff_vvc_alf_filter_chroma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, \
221 int width, int height, const int16_t *filter, const int16_t *clip, const int vb_pos) \
222 { \
223 BF(ff_vvc_alf_filter_chroma, bpc, opt)(dst, dst_stride, src, src_stride, width, height, \
224 filter, clip, 0, vb_pos,(1 << bd) - 1); \
225 } \
226 void bf(ff_vvc_alf_classify, bd, opt)(int *class_idx, int *transpose_idx, \
227 const uint8_t *src, ptrdiff_t src_stride, int width, int height, int vb_pos, int *gradient_tmp) \
228 { \
229 BF(ff_vvc_alf_classify_grad, bpc, opt)(gradient_tmp, src, src_stride, width, height, vb_pos); \
230 BF(ff_vvc_alf_classify, bpc, opt)(class_idx, transpose_idx, gradient_tmp, width, height, vb_pos, bd); \
231 } \
232
/* Depth 8 uses the 8bpc kernels; depths 10 and 12 share the 16bpc kernels. */
233 2176 ALF_FUNCS(8, 8, avx2)
234 2176 ALF_FUNCS(16, 10, avx2)
235 2176 ALF_FUNCS(16, 12, avx2)
236
237 #endif
238
/* Installs one pair of function pointers at index [C][W][idx1][idx2]:
 *   dst      <- ff_vvc_put_<name>_<D>_<opt>        (the forwarder generated by FW_PUT)
 *   dst_uni  <- ff_h2656_put_uni_<name>_<D>_<opt>  (used directly, no wrapper)
 * Expands to two separate statements without a do { } while (0) guard, so it
 * must only be used inside a braced block. The body ends with a backslash, so
 * the following empty line is part of the definition. */
239 #define PEL_LINK(dst, C, W, idx1, idx2, name, D, opt) \
240 dst[C][W][idx1][idx2] = ff_vvc_put_## name ## _ ## D ## _##opt; \
241 dst ## _uni[C][W][idx1][idx2] = ff_h2656_put_uni_ ## name ## _ ## D ## _##opt; \
242
/* Links the size suffixes 4, 8, 16, 32, 64, 128 to W indices 1..6 for one
 * component C and one (my, mx) index pair. */
243 #define MC_TAP_LINKS(pointer, C, my, mx, fname, bitd, opt ) \
244 PEL_LINK(pointer, C, 1, my , mx , fname##4 , bitd, opt ); \
245 PEL_LINK(pointer, C, 2, my , mx , fname##8 , bitd, opt ); \
246 PEL_LINK(pointer, C, 3, my , mx , fname##16, bitd, opt ); \
247 PEL_LINK(pointer, C, 4, my , mx , fname##32, bitd, opt ); \
248 PEL_LINK(pointer, C, 5, my , mx , fname##64, bitd, opt ); \
249 PEL_LINK(pointer, C, 6, my , mx , fname##128, bitd, opt );
250
/* 8-tap filters are linked into the LUMA tables. */
251 #define MC_8TAP_LINKS(pointer, my, mx, fname, bitd, opt) \
252 MC_TAP_LINKS(pointer, LUMA, my, mx, fname, bitd, opt)
253
/* (my, mx) selects the variant: (0,0) pixels, (0,1) h, (1,0) v, (1,1) hv. */
254 #define MC_8TAP_LINKS_SSE4(bd) \
255 MC_8TAP_LINKS(c->inter.put, 0, 0, pixels, bd, sse4); \
256 MC_8TAP_LINKS(c->inter.put, 0, 1, 8tap_h, bd, sse4); \
257 MC_8TAP_LINKS(c->inter.put, 1, 0, 8tap_v, bd, sse4); \
258 MC_8TAP_LINKS(c->inter.put, 1, 1, 8tap_hv, bd, sse4)
259
/* 4-tap filters are linked into the CHROMA tables and additionally fill
 * W index 0 with the size-2 variant.
 * NOTE: the body ends with a backslash, so the following empty line is part of
 * the definition. */
260 #define MC_4TAP_LINKS(pointer, my, mx, fname, bitd, opt) \
261 PEL_LINK(pointer, CHROMA, 0, my , mx , fname##2 , bitd, opt ); \
262 MC_TAP_LINKS(pointer, CHROMA, my, mx, fname, bitd, opt) \
263
/* Same (my, mx) layout as the 8-tap version, using the 4-tap filters. */
264 #define MC_4TAP_LINKS_SSE4(bd) \
265 MC_4TAP_LINKS(c->inter.put, 0, 0, pixels, bd, sse4); \
266 MC_4TAP_LINKS(c->inter.put, 0, 1, 4tap_h, bd, sse4); \
267 MC_4TAP_LINKS(c->inter.put, 1, 0, 4tap_v, bd, sse4); \
268 MC_4TAP_LINKS(c->inter.put, 1, 1, 4tap_hv, bd, sse4)
269
/* Links every SSE4 put function (CHROMA first, then LUMA) for one depth.
 * Expects a variable named `c` in scope at the expansion site. */
270 #define MC_LINK_SSE4(bd) \
271 MC_4TAP_LINKS_SSE4(bd) \
272 MC_8TAP_LINKS_SSE4(bd)
273
/* Links the AVX2 pixels, <tap>tap_h and <tap>tap_v functions for sizes 32, 64
 * and 128 (W indices 4, 5, 6) of component C. The hv slots ([1][1]) are not
 * touched here. */
274 #define MC_TAP_LINKS_AVX2(C,tap,bd) do { \
275 PEL_LINK(c->inter.put, C, 4, 0, 0, pixels32, bd, avx2) \
276 PEL_LINK(c->inter.put, C, 5, 0, 0, pixels64, bd, avx2) \
277 PEL_LINK(c->inter.put, C, 6, 0, 0, pixels128, bd, avx2) \
278 PEL_LINK(c->inter.put, C, 4, 0, 1, tap##tap_h32, bd, avx2) \
279 PEL_LINK(c->inter.put, C, 5, 0, 1, tap##tap_h64, bd, avx2) \
280 PEL_LINK(c->inter.put, C, 6, 0, 1, tap##tap_h128, bd, avx2) \
281 PEL_LINK(c->inter.put, C, 4, 1, 0, tap##tap_v32, bd, avx2) \
282 PEL_LINK(c->inter.put, C, 5, 1, 0, tap##tap_v64, bd, avx2) \
283 PEL_LINK(c->inter.put, C, 6, 1, 0, tap##tap_v128, bd, avx2) \
284 } while (0)
285
/* LUMA uses the 8-tap filters, CHROMA the 4-tap filters. Expands to two
 * statements, so use it only inside a braced block. */
286 #define MC_LINKS_AVX2(bd) \
287 MC_TAP_LINKS_AVX2(LUMA, 8, bd); \
288 MC_TAP_LINKS_AVX2(CHROMA, 4, bd);
289
/* Extra AVX2 links that the init function applies only for depths 10 and 12:
 * all four variants for size 16 (W index 3) and the hv variant for sizes 32,
 * 64 and 128 (W indices 4, 5, 6). */
290 #define MC_TAP_LINKS_16BPC_AVX2(C, tap, bd) do { \
291 PEL_LINK(c->inter.put, C, 3, 0, 0, pixels16, bd, avx2) \
292 PEL_LINK(c->inter.put, C, 3, 0, 1, tap##tap_h16, bd, avx2) \
293 PEL_LINK(c->inter.put, C, 3, 1, 0, tap##tap_v16, bd, avx2) \
294 PEL_LINK(c->inter.put, C, 3, 1, 1, tap##tap_hv16, bd, avx2) \
295 PEL_LINK(c->inter.put, C, 4, 1, 1, tap##tap_hv32, bd, avx2) \
296 PEL_LINK(c->inter.put, C, 5, 1, 1, tap##tap_hv64, bd, avx2) \
297 PEL_LINK(c->inter.put, C, 6, 1, 1, tap##tap_hv128, bd, avx2) \
298 } while (0)
299
/* LUMA uses the 8-tap filters, CHROMA the 4-tap filters. Expands to two
 * statements, so use it only inside a braced block. */
300 #define MC_LINKS_16BPC_AVX2(bd) \
301 MC_TAP_LINKS_16BPC_AVX2(LUMA, 8, bd); \
302 MC_TAP_LINKS_16BPC_AVX2(CHROMA, 4, bd);
303
/* Points c->inter.avg and c->inter.w_avg at the per-bit-depth wrappers
 * generated by AVG_FUNCS. Expects a variable named `c` in scope. */
304 #define AVG_INIT(bd, opt) do { \
305 c->inter.avg = bf(ff_vvc_avg, bd, opt); \
306 c->inter.w_avg = bf(ff_vvc_w_avg, bd, opt); \
307 } while (0)
308
/* Points the LUMA/CHROMA ALF filters and the ALF classifier at the AVX2
 * per-bit-depth wrappers generated by ALF_FUNCS. */
309 #define ALF_INIT(bd) do { \
310 c->alf.filter[LUMA] = ff_vvc_alf_filter_luma_##bd##_avx2; \
311 c->alf.filter[CHROMA] = ff_vvc_alf_filter_chroma_##bd##_avx2; \
312 c->alf.classify = ff_vvc_alf_classify_##bd##_avx2; \
313 } while (0)
314
/* The SAD function has a single AVX2 version with no bit-depth suffix; the init
 * function installs the same pointer for depths 8, 10 and 12. It is declared
 * here and not defined in this file (presumably assembly -- confirm). */
315 int ff_vvc_sad_avx2(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
316 #define SAD_INIT() c->inter.sad = ff_vvc_sad_avx2
317 #endif
318
/**
 * Install the x86 SIMD implementations into a VVC DSP context.
 *
 * @param c   DSP context whose inter/alf function pointers are overwritten
 * @param bd  bit depth; only 8, 10 and 12 are handled, any other value leaves
 *            the context untouched
 *
 * The whole body is compiled only when ARCH_X86_64 is set; otherwise the
 * function is empty. For each handled depth the SSE4 links are written first
 * and the AVX2 links afterwards, so where both sets cover the same table slot
 * the AVX2 pointer is the one that remains.
 */
319 1163 void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
320 {
321 #if ARCH_X86_64
322 1163 const int cpu_flags = av_get_cpu_flags();
323
324
3/4
✓ Branch 0 taken 94 times.
✓ Branch 1 taken 978 times.
✓ Branch 2 taken 91 times.
✗ Branch 3 not taken.
1163 switch (bd) {
/* 8-bit: no MC_LINKS_16BPC_AVX2 step, unlike the 10- and 12-bit cases. */
325 94 case 8:
326
2/2
✓ Branch 0 taken 42 times.
✓ Branch 1 taken 52 times.
94 if (EXTERNAL_SSE4(cpu_flags)) {
327 42 MC_LINK_SSE4(8);
328 }
329
3/4
✓ Branch 0 taken 7 times.
✓ Branch 1 taken 87 times.
✓ Branch 2 taken 7 times.
✗ Branch 3 not taken.
94 if (EXTERNAL_AVX2_FAST(cpu_flags)) {
330 7 ALF_INIT(8);
331 7 AVG_INIT(8, avx2);
332 7 MC_LINKS_AVX2(8);
333 7 SAD_INIT();
334 }
335 94 break;
336 978 case 10:
337
2/2
✓ Branch 0 taken 48 times.
✓ Branch 1 taken 930 times.
978 if (EXTERNAL_SSE4(cpu_flags)) {
338 48 MC_LINK_SSE4(10);
339 }
340
3/4
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 970 times.
✓ Branch 2 taken 8 times.
✗ Branch 3 not taken.
978 if (EXTERNAL_AVX2_FAST(cpu_flags)) {
341 8 ALF_INIT(10);
342 8 AVG_INIT(10, avx2);
343 8 MC_LINKS_AVX2(10);
344 8 MC_LINKS_16BPC_AVX2(10);
345 8 SAD_INIT();
346 }
347 978 break;
348 91 case 12:
349
2/2
✓ Branch 0 taken 42 times.
✓ Branch 1 taken 49 times.
91 if (EXTERNAL_SSE4(cpu_flags)) {
350 42 MC_LINK_SSE4(12);
351 }
352
3/4
✓ Branch 0 taken 7 times.
✓ Branch 1 taken 84 times.
✓ Branch 2 taken 7 times.
✗ Branch 3 not taken.
91 if (EXTERNAL_AVX2_FAST(cpu_flags)) {
353 7 ALF_INIT(12);
354 7 AVG_INIT(12, avx2);
355 7 MC_LINKS_AVX2(12);
356 7 MC_LINKS_16BPC_AVX2(12);
357 7 SAD_INIT();
358 }
359 91 break;
/* Any other bit depth: nothing is installed. */
360 default:
361 break;
362 }
363 #endif
364 1163 }
365