Line | Branch | Exec | Source |
---|---|---|---|
1 | /* | ||
2 | * VP9 SIMD optimizations | ||
3 | * | ||
4 | * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> | ||
5 | * | ||
6 | * This file is part of FFmpeg. | ||
7 | * | ||
8 | * FFmpeg is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU Lesser General Public | ||
10 | * License as published by the Free Software Foundation; either | ||
11 | * version 2.1 of the License, or (at your option) any later version. | ||
12 | * | ||
13 | * FFmpeg is distributed in the hope that it will be useful, | ||
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
16 | * Lesser General Public License for more details. | ||
17 | * | ||
18 | * You should have received a copy of the GNU Lesser General Public | ||
19 | * License along with FFmpeg; if not, write to the Free Software | ||
20 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
21 | */ | ||
22 | |||
23 | #include "libavutil/attributes.h" | ||
24 | #include "libavutil/cpu.h" | ||
25 | #include "libavutil/x86/cpu.h" | ||
26 | #include "libavcodec/vp9dsp.h" | ||
27 | #include "libavcodec/x86/vp9dsp_init.h" | ||
28 | |||
29 | #if HAVE_X86ASM | ||
30 | |||
31 | decl_fpel_func(put, 4, , mmx); | ||
32 | decl_fpel_func(put, 8, , mmx); | ||
33 | decl_fpel_func(put, 16, , sse); | ||
34 | decl_fpel_func(put, 32, , sse); | ||
35 | decl_fpel_func(put, 64, , sse); | ||
36 | decl_fpel_func(avg, 4, _8, mmxext); | ||
37 | decl_fpel_func(avg, 8, _8, mmxext); | ||
38 | decl_fpel_func(avg, 16, _8, sse2); | ||
39 | decl_fpel_func(avg, 32, _8, sse2); | ||
40 | decl_fpel_func(avg, 64, _8, sse2); | ||
41 | decl_fpel_func(put, 32, , avx); | ||
42 | decl_fpel_func(put, 64, , avx); | ||
43 | decl_fpel_func(avg, 32, _8, avx2); | ||
44 | decl_fpel_func(avg, 64, _8, avx2); | ||
45 | |||
46 | decl_mc_funcs(4, mmxext, int16_t, 8, 8); | ||
47 | decl_mc_funcs(8, sse2, int16_t, 8, 8); | ||
48 | decl_mc_funcs(4, ssse3, int8_t, 32, 8); | ||
49 | decl_mc_funcs(8, ssse3, int8_t, 32, 8); | ||
50 | #if ARCH_X86_64 | ||
51 | decl_mc_funcs(16, ssse3, int8_t, 32, 8); | ||
52 | decl_mc_funcs(32, avx2, int8_t, 32, 8); | ||
53 | decl_subpel_asm( 4, 8, avx512icl); | ||
54 | decl_subpel_asm( 8, 8, avx512icl); | ||
55 | decl_subpel_asm(16, 8, avx512icl); | ||
56 | decl_subpel_asm(32, 8, avx512icl); | ||
57 | decl_subpel_asm(64, 8, avx512icl); | ||
58 | #endif | ||
59 | |||
60 | 672 | mc_rep_funcs(16, 8, 8, sse2, int16_t, 8, 8) | |
61 | #if ARCH_X86_32 | ||
62 | mc_rep_funcs(16, 8, 8, ssse3, int8_t, 32, 8) | ||
63 | #endif | ||
64 | 288 | mc_rep_funcs(32, 16, 16, sse2, int16_t, 8, 8) | |
65 | 288 | mc_rep_funcs(32, 16, 16, ssse3, int8_t, 32, 8) | |
66 | 96 | mc_rep_funcs(64, 32, 32, sse2, int16_t, 8, 8) | |
67 | 96 | mc_rep_funcs(64, 32, 32, ssse3, int8_t, 32, 8) | |
68 | #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL | ||
69 | 52 | mc_rep_funcs(64, 32, 32, avx2, int8_t, 32, 8) | |
70 | #endif | ||
71 | |||
72 | extern const int8_t ff_filters_ssse3[3][15][4][32]; | ||
73 | extern const int16_t ff_filters_sse2[3][15][8][8]; | ||
74 | |||
75 | 60 | filters_8tap_2d_fn2(put, 16, 8, 1, mmxext, sse2, sse2) | |
76 | 60 | filters_8tap_2d_fn2(avg, 16, 8, 1, mmxext, sse2, sse2) | |
77 | 894 | filters_8tap_2d_fn2(put, 16, 8, 1, ssse3, ssse3, ssse3) | |
78 | 42 | filters_8tap_2d_fn2(avg, 16, 8, 1, ssse3, ssse3, ssse3) | |
79 | #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL | ||
80 | 6 | filters_8tap_2d_fn(put, 64, 32, 8, 1, avx2, ssse3) | |
81 | 10 | filters_8tap_2d_fn(put, 32, 32, 8, 1, avx2, ssse3) | |
82 | 6 | filters_8tap_2d_fn(avg, 64, 32, 8, 1, avx2, ssse3) | |
83 | 6 | filters_8tap_2d_fn(avg, 32, 32, 8, 1, avx2, ssse3) | |
84 | #endif | ||
85 | |||
86 | 120 | filters_8tap_1d_fn3(put, 8, mmxext, sse2, sse2) | |
87 | 120 | filters_8tap_1d_fn3(avg, 8, mmxext, sse2, sse2) | |
88 | 628 | filters_8tap_1d_fn3(put, 8, ssse3, ssse3, ssse3) | |
89 | 84 | filters_8tap_1d_fn3(avg, 8, ssse3, ssse3, ssse3) | |
90 | #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL | ||
91 | 16 | filters_8tap_1d_fn2(put, 64, 8, avx2, ssse3) | |
92 | 50 | filters_8tap_1d_fn2(put, 32, 8, avx2, ssse3) | |
93 | 12 | filters_8tap_1d_fn2(avg, 64, 8, avx2, ssse3) | |
94 | 12 | filters_8tap_1d_fn2(avg, 32, 8, avx2, ssse3) | |
95 | #endif | ||
96 | |||
97 | #define itxfm_func(typea, typeb, size, opt) \ | ||
98 | void ff_vp9_##typea##_##typeb##_##size##x##size##_add_##opt(uint8_t *dst, ptrdiff_t stride, \ | ||
99 | int16_t *block, int eob) | ||
100 | #define itxfm_funcs(size, opt) \ | ||
101 | itxfm_func(idct, idct, size, opt); \ | ||
102 | itxfm_func(iadst, idct, size, opt); \ | ||
103 | itxfm_func(idct, iadst, size, opt); \ | ||
104 | itxfm_func(iadst, iadst, size, opt) | ||
105 | |||
106 | itxfm_func(idct, idct, 4, mmxext); | ||
107 | itxfm_func(idct, iadst, 4, sse2); | ||
108 | itxfm_func(iadst, idct, 4, sse2); | ||
109 | itxfm_func(iadst, iadst, 4, sse2); | ||
110 | itxfm_funcs(4, ssse3); | ||
111 | itxfm_funcs(8, sse2); | ||
112 | itxfm_funcs(8, ssse3); | ||
113 | itxfm_funcs(8, avx); | ||
114 | itxfm_funcs(16, sse2); | ||
115 | itxfm_funcs(16, ssse3); | ||
116 | itxfm_funcs(16, avx); | ||
117 | itxfm_func(idct, idct, 32, sse2); | ||
118 | itxfm_func(idct, idct, 32, ssse3); | ||
119 | itxfm_func(idct, idct, 32, avx); | ||
120 | itxfm_func(iwht, iwht, 4, mmx); | ||
121 | itxfm_funcs(16, avx2); | ||
122 | itxfm_funcs(16, avx512icl); | ||
123 | itxfm_func(idct, idct, 32, avx2); | ||
124 | itxfm_func(idct, idct, 32, avx512icl); | ||
125 | |||
126 | #undef itxfm_func | ||
127 | #undef itxfm_funcs | ||
128 | |||
129 | #define lpf_funcs(size1, size2, opt) \ | ||
130 | void ff_vp9_loop_filter_v_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \ | ||
131 | int E, int I, int H); \ | ||
132 | void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \ | ||
133 | int E, int I, int H) | ||
134 | |||
135 | lpf_funcs(4, 8, mmxext); | ||
136 | lpf_funcs(8, 8, mmxext); | ||
137 | lpf_funcs(16, 16, sse2); | ||
138 | lpf_funcs(16, 16, ssse3); | ||
139 | lpf_funcs(16, 16, avx); | ||
140 | lpf_funcs(44, 16, sse2); | ||
141 | lpf_funcs(44, 16, ssse3); | ||
142 | lpf_funcs(44, 16, avx); | ||
143 | lpf_funcs(84, 16, sse2); | ||
144 | lpf_funcs(84, 16, ssse3); | ||
145 | lpf_funcs(84, 16, avx); | ||
146 | lpf_funcs(48, 16, sse2); | ||
147 | lpf_funcs(48, 16, ssse3); | ||
148 | lpf_funcs(48, 16, avx); | ||
149 | lpf_funcs(88, 16, sse2); | ||
150 | lpf_funcs(88, 16, ssse3); | ||
151 | lpf_funcs(88, 16, avx); | ||
152 | |||
153 | #undef lpf_funcs | ||
154 | |||
155 | #define ipred_func(size, type, opt) \ | ||
156 | void ff_vp9_ipred_##type##_##size##x##size##_##opt(uint8_t *dst, ptrdiff_t stride, \ | ||
157 | const uint8_t *l, const uint8_t *a) | ||
158 | |||
159 | ipred_func(8, v, mmx); | ||
160 | |||
161 | #define ipred_dc_funcs(size, opt) \ | ||
162 | ipred_func(size, dc, opt); \ | ||
163 | ipred_func(size, dc_left, opt); \ | ||
164 | ipred_func(size, dc_top, opt) | ||
165 | |||
166 | ipred_dc_funcs(4, mmxext); | ||
167 | ipred_dc_funcs(8, mmxext); | ||
168 | |||
169 | #define ipred_dir_tm_funcs(size, opt) \ | ||
170 | ipred_func(size, tm, opt); \ | ||
171 | ipred_func(size, dl, opt); \ | ||
172 | ipred_func(size, dr, opt); \ | ||
173 | ipred_func(size, hd, opt); \ | ||
174 | ipred_func(size, hu, opt); \ | ||
175 | ipred_func(size, vl, opt); \ | ||
176 | ipred_func(size, vr, opt) | ||
177 | |||
178 | ipred_dir_tm_funcs(4, mmxext); | ||
179 | |||
180 | ipred_func(16, v, sse); | ||
181 | ipred_func(32, v, sse); | ||
182 | |||
183 | ipred_dc_funcs(16, sse2); | ||
184 | ipred_dc_funcs(32, sse2); | ||
185 | |||
186 | #define ipred_dir_tm_h_funcs(size, opt) \ | ||
187 | ipred_dir_tm_funcs(size, opt); \ | ||
188 | ipred_func(size, h, opt) | ||
189 | |||
190 | ipred_dir_tm_h_funcs(8, sse2); | ||
191 | ipred_dir_tm_h_funcs(16, sse2); | ||
192 | ipred_dir_tm_h_funcs(32, sse2); | ||
193 | |||
194 | ipred_func(4, h, sse2); | ||
195 | |||
196 | #define ipred_all_funcs(size, opt) \ | ||
197 | ipred_dc_funcs(size, opt); \ | ||
198 | ipred_dir_tm_h_funcs(size, opt) | ||
199 | |||
200 | // FIXME hd/vl_4x4_ssse3 does not exist | ||
201 | ipred_all_funcs(4, ssse3); | ||
202 | ipred_all_funcs(8, ssse3); | ||
203 | ipred_all_funcs(16, ssse3); | ||
204 | ipred_all_funcs(32, ssse3); | ||
205 | |||
206 | ipred_dir_tm_h_funcs(8, avx); | ||
207 | ipred_dir_tm_h_funcs(16, avx); | ||
208 | ipred_dir_tm_h_funcs(32, avx); | ||
209 | |||
210 | ipred_all_funcs(32, avx2); | ||
211 | ipred_func(32, v, avx2); | ||
212 | |||
213 | #undef ipred_func | ||
214 | #undef ipred_dir_tm_h_funcs | ||
215 | #undef ipred_dir_tm_funcs | ||
216 | #undef ipred_dc_funcs | ||
217 | |||
218 | #endif /* HAVE_X86ASM */ | ||
219 | |||
220 | 674 | av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact) | |
221 | { | ||
222 | #if HAVE_X86ASM | ||
223 | int cpu_flags; | ||
224 | |||
225 |
2/2✓ Branch 0 taken 81 times.
✓ Branch 1 taken 593 times.
|
674 | if (bpp == 10) { |
226 | 81 | ff_vp9dsp_init_10bpp_x86(dsp, bitexact); | |
227 | 81 | return; | |
228 |
2/2✓ Branch 0 taken 75 times.
✓ Branch 1 taken 518 times.
|
593 | } else if (bpp == 12) { |
229 | 75 | ff_vp9dsp_init_12bpp_x86(dsp, bitexact); | |
230 | 75 | return; | |
231 | } | ||
232 | |||
233 | 518 | cpu_flags = av_get_cpu_flags(); | |
234 | |||
235 | #define init_lpf(opt) do { \ | ||
236 | dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_##opt; \ | ||
237 | dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_##opt; \ | ||
238 | dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_##opt; \ | ||
239 | dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_##opt; \ | ||
240 | dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_##opt; \ | ||
241 | dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_##opt; \ | ||
242 | dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_##opt; \ | ||
243 | dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_##opt; \ | ||
244 | dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_##opt; \ | ||
245 | dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_##opt; \ | ||
246 | } while (0) | ||
247 | |||
248 | #define init_ipred(sz, opt, t, e) \ | ||
249 | dsp->intra_pred[TX_##sz##X##sz][e##_PRED] = ff_vp9_ipred_##t##_##sz##x##sz##_##opt | ||
250 | |||
251 | #define ff_vp9_ipred_hd_4x4_ssse3 ff_vp9_ipred_hd_4x4_mmxext | ||
252 | #define ff_vp9_ipred_vl_4x4_ssse3 ff_vp9_ipred_vl_4x4_mmxext | ||
253 | #define init_dir_tm_ipred(sz, opt) do { \ | ||
254 | init_ipred(sz, opt, dl, DIAG_DOWN_LEFT); \ | ||
255 | init_ipred(sz, opt, dr, DIAG_DOWN_RIGHT); \ | ||
256 | init_ipred(sz, opt, hd, HOR_DOWN); \ | ||
257 | init_ipred(sz, opt, vl, VERT_LEFT); \ | ||
258 | init_ipred(sz, opt, hu, HOR_UP); \ | ||
259 | init_ipred(sz, opt, tm, TM_VP8); \ | ||
260 | init_ipred(sz, opt, vr, VERT_RIGHT); \ | ||
261 | } while (0) | ||
262 | #define init_dir_tm_h_ipred(sz, opt) do { \ | ||
263 | init_dir_tm_ipred(sz, opt); \ | ||
264 | init_ipred(sz, opt, h, HOR); \ | ||
265 | } while (0) | ||
266 | #define init_dc_ipred(sz, opt) do { \ | ||
267 | init_ipred(sz, opt, dc, DC); \ | ||
268 | init_ipred(sz, opt, dc_left, LEFT_DC); \ | ||
269 | init_ipred(sz, opt, dc_top, TOP_DC); \ | ||
270 | } while (0) | ||
271 | #define init_all_ipred(sz, opt) do { \ | ||
272 | init_dc_ipred(sz, opt); \ | ||
273 | init_dir_tm_h_ipred(sz, opt); \ | ||
274 | } while (0) | ||
275 | |||
276 |
2/2✓ Branch 0 taken 62 times.
✓ Branch 1 taken 456 times.
|
518 | if (EXTERNAL_MMX(cpu_flags)) { |
277 | 62 | init_fpel_func(4, 0, 4, put, , mmx); | |
278 | 62 | init_fpel_func(3, 0, 8, put, , mmx); | |
279 |
1/2✓ Branch 0 taken 62 times.
✗ Branch 1 not taken.
|
62 | if (!bitexact) { |
280 | 62 | dsp->itxfm_add[4 /* lossless */][DCT_DCT] = | |
281 | 62 | dsp->itxfm_add[4 /* lossless */][ADST_DCT] = | |
282 | 62 | dsp->itxfm_add[4 /* lossless */][DCT_ADST] = | |
283 | 62 | dsp->itxfm_add[4 /* lossless */][ADST_ADST] = ff_vp9_iwht_iwht_4x4_add_mmx; | |
284 | } | ||
285 | 62 | init_ipred(8, mmx, v, VERT); | |
286 | } | ||
287 | |||
288 |
2/2✓ Branch 0 taken 57 times.
✓ Branch 1 taken 461 times.
|
518 | if (EXTERNAL_MMXEXT(cpu_flags)) { |
289 | 57 | dsp->loop_filter_8[0][0] = ff_vp9_loop_filter_h_4_8_mmxext; | |
290 | 57 | dsp->loop_filter_8[0][1] = ff_vp9_loop_filter_v_4_8_mmxext; | |
291 | 57 | dsp->loop_filter_8[1][0] = ff_vp9_loop_filter_h_8_8_mmxext; | |
292 | 57 | dsp->loop_filter_8[1][1] = ff_vp9_loop_filter_v_8_8_mmxext; | |
293 | 57 | init_subpel2(4, 0, 4, put, 8, mmxext); | |
294 | 57 | init_subpel2(4, 1, 4, avg, 8, mmxext); | |
295 | 57 | init_fpel_func(4, 1, 4, avg, _8, mmxext); | |
296 | 57 | init_fpel_func(3, 1, 8, avg, _8, mmxext); | |
297 | 57 | dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_mmxext; | |
298 | 57 | init_dc_ipred(4, mmxext); | |
299 | 57 | init_dc_ipred(8, mmxext); | |
300 | 57 | init_dir_tm_ipred(4, mmxext); | |
301 | } | ||
302 | |||
303 |
2/2✓ Branch 0 taken 52 times.
✓ Branch 1 taken 466 times.
|
518 | if (EXTERNAL_SSE(cpu_flags)) { |
304 | 52 | init_fpel_func(2, 0, 16, put, , sse); | |
305 | 52 | init_fpel_func(1, 0, 32, put, , sse); | |
306 | 52 | init_fpel_func(0, 0, 64, put, , sse); | |
307 | 52 | init_ipred(16, sse, v, VERT); | |
308 | 52 | init_ipred(32, sse, v, VERT); | |
309 | } | ||
310 | |||
311 |
2/2✓ Branch 0 taken 47 times.
✓ Branch 1 taken 471 times.
|
518 | if (EXTERNAL_SSE2(cpu_flags)) { |
312 | 47 | init_subpel3_8to64(0, put, 8, sse2); | |
313 | 47 | init_subpel3_8to64(1, avg, 8, sse2); | |
314 | 47 | init_fpel_func(2, 1, 16, avg, _8, sse2); | |
315 | 47 | init_fpel_func(1, 1, 32, avg, _8, sse2); | |
316 | 47 | init_fpel_func(0, 1, 64, avg, _8, sse2); | |
317 | 47 | init_lpf(sse2); | |
318 | 47 | dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_sse2; | |
319 | 47 | dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_sse2; | |
320 | 47 | dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_sse2; | |
321 | 47 | dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_sse2; | |
322 | 47 | dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_sse2; | |
323 | 47 | dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_sse2; | |
324 | 47 | dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_sse2; | |
325 | 47 | dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_sse2; | |
326 | 47 | dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_sse2; | |
327 | 47 | dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_sse2; | |
328 | 47 | dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_sse2; | |
329 | 47 | dsp->itxfm_add[TX_32X32][ADST_ADST] = | |
330 | 47 | dsp->itxfm_add[TX_32X32][ADST_DCT] = | |
331 | 47 | dsp->itxfm_add[TX_32X32][DCT_ADST] = | |
332 | 47 | dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_sse2; | |
333 | 47 | init_dc_ipred(16, sse2); | |
334 | 47 | init_dc_ipred(32, sse2); | |
335 | 47 | init_dir_tm_h_ipred(8, sse2); | |
336 | 47 | init_dir_tm_h_ipred(16, sse2); | |
337 | 47 | init_dir_tm_h_ipred(32, sse2); | |
338 | 47 | init_ipred(4, sse2, h, HOR); | |
339 | } | ||
340 | |||
341 |
2/2✓ Branch 0 taken 37 times.
✓ Branch 1 taken 481 times.
|
518 | if (EXTERNAL_SSSE3(cpu_flags)) { |
342 | 37 | init_subpel3(0, put, 8, ssse3); | |
343 | 37 | init_subpel3(1, avg, 8, ssse3); | |
344 | 37 | dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_ssse3; | |
345 | 37 | dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_ssse3; | |
346 | 37 | dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_ssse3; | |
347 | 37 | dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_ssse3; | |
348 | 37 | dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_ssse3; | |
349 | 37 | dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_ssse3; | |
350 | 37 | dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_ssse3; | |
351 | 37 | dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_ssse3; | |
352 | 37 | dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_ssse3; | |
353 | 37 | dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_ssse3; | |
354 | 37 | dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_ssse3; | |
355 | 37 | dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_ssse3; | |
356 | 37 | dsp->itxfm_add[TX_32X32][ADST_ADST] = | |
357 | 37 | dsp->itxfm_add[TX_32X32][ADST_DCT] = | |
358 | 37 | dsp->itxfm_add[TX_32X32][DCT_ADST] = | |
359 | 37 | dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_ssse3; | |
360 | 37 | init_lpf(ssse3); | |
361 | 37 | init_all_ipred(4, ssse3); | |
362 | 37 | init_all_ipred(8, ssse3); | |
363 | 37 | init_all_ipred(16, ssse3); | |
364 | 37 | init_all_ipred(32, ssse3); | |
365 | } | ||
366 | |||
367 |
2/2✓ Branch 0 taken 17 times.
✓ Branch 1 taken 501 times.
|
518 | if (EXTERNAL_AVX(cpu_flags)) { |
368 | 17 | dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_avx; | |
369 | 17 | dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_avx; | |
370 | 17 | dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_avx; | |
371 | 17 | dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_avx; | |
372 | 17 | dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx; | |
373 | 17 | dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx; | |
374 | 17 | dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx; | |
375 | 17 | dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx; | |
376 | 17 | dsp->itxfm_add[TX_32X32][ADST_ADST] = | |
377 | 17 | dsp->itxfm_add[TX_32X32][ADST_DCT] = | |
378 | 17 | dsp->itxfm_add[TX_32X32][DCT_ADST] = | |
379 | 17 | dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx; | |
380 | 17 | init_lpf(avx); | |
381 | 17 | init_dir_tm_h_ipred(8, avx); | |
382 | 17 | init_dir_tm_h_ipred(16, avx); | |
383 | 17 | init_dir_tm_h_ipred(32, avx); | |
384 | } | ||
385 |
3/4✓ Branch 0 taken 17 times.
✓ Branch 1 taken 501 times.
✓ Branch 2 taken 17 times.
✗ Branch 3 not taken.
|
518 | if (EXTERNAL_AVX_FAST(cpu_flags)) { |
386 | 17 | init_fpel_func(1, 0, 32, put, , avx); | |
387 | 17 | init_fpel_func(0, 0, 64, put, , avx); | |
388 | } | ||
389 | |||
390 |
3/4✓ Branch 0 taken 7 times.
✓ Branch 1 taken 511 times.
✓ Branch 2 taken 7 times.
✗ Branch 3 not taken.
|
518 | if (EXTERNAL_AVX2_FAST(cpu_flags)) { |
391 | 7 | init_fpel_func(1, 1, 32, avg, _8, avx2); | |
392 | 7 | init_fpel_func(0, 1, 64, avg, _8, avx2); | |
393 | if (ARCH_X86_64) { | ||
394 | #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL | ||
395 | 7 | dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx2; | |
396 | 7 | dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx2; | |
397 | 7 | dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx2; | |
398 | 7 | dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx2; | |
399 | 7 | dsp->itxfm_add[TX_32X32][ADST_ADST] = | |
400 | 7 | dsp->itxfm_add[TX_32X32][ADST_DCT] = | |
401 | 7 | dsp->itxfm_add[TX_32X32][DCT_ADST] = | |
402 | 7 | dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx2; | |
403 | 7 | init_subpel3_32_64(0, put, 8, avx2); | |
404 | 7 | init_subpel3_32_64(1, avg, 8, avx2); | |
405 | #endif | ||
406 | } | ||
407 | 7 | init_all_ipred(32, avx2); | |
408 | 7 | init_ipred(32, avx2, v, VERT); | |
409 | } | ||
410 | |||
411 | #if ARCH_X86_64 | ||
412 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 518 times.
|
518 | if (EXTERNAL_AVX512ICL(cpu_flags)) { |
413 | ✗ | dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx512icl; | |
414 | ✗ | dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx512icl; | |
415 | ✗ | dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx512icl; | |
416 | ✗ | dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx512icl; | |
417 | ✗ | dsp->itxfm_add[TX_32X32][ADST_ADST] = | |
418 | ✗ | dsp->itxfm_add[TX_32X32][ADST_DCT] = | |
419 | ✗ | dsp->itxfm_add[TX_32X32][DCT_ADST] = | |
420 | ✗ | dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx512icl; | |
421 | ✗ | init_subpel_asm(4, 4, 8, avx512icl); | |
422 | ✗ | init_subpel_asm(3, 8, 8, avx512icl); | |
423 | ✗ | init_subpel_asm(2, 16, 8, avx512icl); | |
424 | ✗ | init_subpel_asm(1, 32, 8, avx512icl); | |
425 | ✗ | init_subpel_asm(0, 64, 8, avx512icl); | |
426 | } | ||
427 | #endif | ||
428 | |||
429 | #undef init_fpel | ||
430 | #undef init_subpel1 | ||
431 | #undef init_subpel2 | ||
432 | #undef init_subpel3 | ||
433 | |||
434 | #endif /* HAVE_X86ASM */ | ||
435 | } | ||
436 |