Line | Branch | Exec | Source |
---|---|---|---|
1 | /* | ||
2 | * VP9 SIMD optimizations | ||
3 | * | ||
4 | * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> | ||
5 | * | ||
6 | * This file is part of FFmpeg. | ||
7 | * | ||
8 | * FFmpeg is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU Lesser General Public | ||
10 | * License as published by the Free Software Foundation; either | ||
11 | * version 2.1 of the License, or (at your option) any later version. | ||
12 | * | ||
13 | * FFmpeg is distributed in the hope that it will be useful, | ||
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
16 | * Lesser General Public License for more details. | ||
17 | * | ||
18 | * You should have received a copy of the GNU Lesser General Public | ||
19 | * License along with FFmpeg; if not, write to the Free Software | ||
20 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
21 | */ | ||
22 | |||
23 | #include "libavutil/attributes.h" | ||
24 | #include "libavutil/cpu.h" | ||
25 | #include "libavutil/x86/cpu.h" | ||
26 | #include "libavcodec/vp9dsp.h" | ||
27 | #include "libavcodec/x86/vp9dsp_init.h" | ||
28 | |||
29 | #if HAVE_X86ASM | ||
30 | |||
31 | decl_fpel_func(put, 4, , mmx); | ||
32 | decl_fpel_func(put, 8, , mmx); | ||
33 | decl_fpel_func(put, 16, , sse); | ||
34 | decl_fpel_func(put, 32, , sse); | ||
35 | decl_fpel_func(put, 64, , sse); | ||
36 | decl_fpel_func(avg, 4, _8, mmxext); | ||
37 | decl_fpel_func(avg, 8, _8, mmxext); | ||
38 | decl_fpel_func(avg, 16, _8, sse2); | ||
39 | decl_fpel_func(avg, 32, _8, sse2); | ||
40 | decl_fpel_func(avg, 64, _8, sse2); | ||
41 | decl_fpel_func(put, 32, , avx); | ||
42 | decl_fpel_func(put, 64, , avx); | ||
43 | decl_fpel_func(avg, 32, _8, avx2); | ||
44 | decl_fpel_func(avg, 64, _8, avx2); | ||
45 | |||
46 | decl_mc_funcs(4, mmxext, int16_t, 8, 8); | ||
47 | decl_mc_funcs(8, sse2, int16_t, 8, 8); | ||
48 | decl_mc_funcs(4, ssse3, int8_t, 32, 8); | ||
49 | decl_mc_funcs(8, ssse3, int8_t, 32, 8); | ||
50 | #if ARCH_X86_64 | ||
51 | decl_mc_funcs(16, ssse3, int8_t, 32, 8); | ||
52 | decl_mc_funcs(32, avx2, int8_t, 32, 8); | ||
53 | #endif | ||
54 | |||
55 | 672 | mc_rep_funcs(16, 8, 8, sse2, int16_t, 8, 8) | |
56 | #if ARCH_X86_32 | ||
57 | mc_rep_funcs(16, 8, 8, ssse3, int8_t, 32, 8) | ||
58 | #endif | ||
59 | 288 | mc_rep_funcs(32, 16, 16, sse2, int16_t, 8, 8) | |
60 | 288 | mc_rep_funcs(32, 16, 16, ssse3, int8_t, 32, 8) | |
61 | 96 | mc_rep_funcs(64, 32, 32, sse2, int16_t, 8, 8) | |
62 | 96 | mc_rep_funcs(64, 32, 32, ssse3, int8_t, 32, 8) | |
63 | #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL | ||
64 | 52 | mc_rep_funcs(64, 32, 32, avx2, int8_t, 32, 8) | |
65 | #endif | ||
66 | |||
67 | extern const int8_t ff_filters_ssse3[3][15][4][32]; | ||
68 | extern const int16_t ff_filters_sse2[3][15][8][8]; | ||
69 | |||
70 | 60 | filters_8tap_2d_fn2(put, 16, 8, 1, mmxext, sse2, sse2) | |
71 | 60 | filters_8tap_2d_fn2(avg, 16, 8, 1, mmxext, sse2, sse2) | |
72 | 894 | filters_8tap_2d_fn2(put, 16, 8, 1, ssse3, ssse3, ssse3) | |
73 | 42 | filters_8tap_2d_fn2(avg, 16, 8, 1, ssse3, ssse3, ssse3) | |
74 | #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL | ||
75 | 6 | filters_8tap_2d_fn(put, 64, 32, 8, 1, avx2, ssse3) | |
76 | 10 | filters_8tap_2d_fn(put, 32, 32, 8, 1, avx2, ssse3) | |
77 | 6 | filters_8tap_2d_fn(avg, 64, 32, 8, 1, avx2, ssse3) | |
78 | 6 | filters_8tap_2d_fn(avg, 32, 32, 8, 1, avx2, ssse3) | |
79 | #endif | ||
80 | |||
81 | 120 | filters_8tap_1d_fn3(put, 8, mmxext, sse2, sse2) | |
82 | 120 | filters_8tap_1d_fn3(avg, 8, mmxext, sse2, sse2) | |
83 | 628 | filters_8tap_1d_fn3(put, 8, ssse3, ssse3, ssse3) | |
84 | 84 | filters_8tap_1d_fn3(avg, 8, ssse3, ssse3, ssse3) | |
85 | #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL | ||
86 | 16 | filters_8tap_1d_fn2(put, 64, 8, avx2, ssse3) | |
87 | 50 | filters_8tap_1d_fn2(put, 32, 8, avx2, ssse3) | |
88 | 12 | filters_8tap_1d_fn2(avg, 64, 8, avx2, ssse3) | |
89 | 12 | filters_8tap_1d_fn2(avg, 32, 8, avx2, ssse3) | |
90 | #endif | ||
91 | |||
92 | #define itxfm_func(typea, typeb, size, opt) \ | ||
93 | void ff_vp9_##typea##_##typeb##_##size##x##size##_add_##opt(uint8_t *dst, ptrdiff_t stride, \ | ||
94 | int16_t *block, int eob) | ||
95 | #define itxfm_funcs(size, opt) \ | ||
96 | itxfm_func(idct, idct, size, opt); \ | ||
97 | itxfm_func(iadst, idct, size, opt); \ | ||
98 | itxfm_func(idct, iadst, size, opt); \ | ||
99 | itxfm_func(iadst, iadst, size, opt) | ||
100 | |||
101 | itxfm_func(idct, idct, 4, mmxext); | ||
102 | itxfm_func(idct, iadst, 4, sse2); | ||
103 | itxfm_func(iadst, idct, 4, sse2); | ||
104 | itxfm_func(iadst, iadst, 4, sse2); | ||
105 | itxfm_funcs(4, ssse3); | ||
106 | itxfm_funcs(8, sse2); | ||
107 | itxfm_funcs(8, ssse3); | ||
108 | itxfm_funcs(8, avx); | ||
109 | itxfm_funcs(16, sse2); | ||
110 | itxfm_funcs(16, ssse3); | ||
111 | itxfm_funcs(16, avx); | ||
112 | itxfm_func(idct, idct, 32, sse2); | ||
113 | itxfm_func(idct, idct, 32, ssse3); | ||
114 | itxfm_func(idct, idct, 32, avx); | ||
115 | itxfm_func(iwht, iwht, 4, mmx); | ||
116 | itxfm_funcs(16, avx2); | ||
117 | itxfm_funcs(16, avx512icl); | ||
118 | itxfm_func(idct, idct, 32, avx2); | ||
119 | itxfm_func(idct, idct, 32, avx512icl); | ||
120 | |||
121 | #undef itxfm_func | ||
122 | #undef itxfm_funcs | ||
123 | |||
124 | #define lpf_funcs(size1, size2, opt) \ | ||
125 | void ff_vp9_loop_filter_v_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \ | ||
126 | int E, int I, int H); \ | ||
127 | void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \ | ||
128 | int E, int I, int H) | ||
129 | |||
130 | lpf_funcs(4, 8, mmxext); | ||
131 | lpf_funcs(8, 8, mmxext); | ||
132 | lpf_funcs(16, 16, sse2); | ||
133 | lpf_funcs(16, 16, ssse3); | ||
134 | lpf_funcs(16, 16, avx); | ||
135 | lpf_funcs(44, 16, sse2); | ||
136 | lpf_funcs(44, 16, ssse3); | ||
137 | lpf_funcs(44, 16, avx); | ||
138 | lpf_funcs(84, 16, sse2); | ||
139 | lpf_funcs(84, 16, ssse3); | ||
140 | lpf_funcs(84, 16, avx); | ||
141 | lpf_funcs(48, 16, sse2); | ||
142 | lpf_funcs(48, 16, ssse3); | ||
143 | lpf_funcs(48, 16, avx); | ||
144 | lpf_funcs(88, 16, sse2); | ||
145 | lpf_funcs(88, 16, ssse3); | ||
146 | lpf_funcs(88, 16, avx); | ||
147 | |||
148 | #undef lpf_funcs | ||
149 | |||
150 | #define ipred_func(size, type, opt) \ | ||
151 | void ff_vp9_ipred_##type##_##size##x##size##_##opt(uint8_t *dst, ptrdiff_t stride, \ | ||
152 | const uint8_t *l, const uint8_t *a) | ||
153 | |||
154 | ipred_func(8, v, mmx); | ||
155 | |||
156 | #define ipred_dc_funcs(size, opt) \ | ||
157 | ipred_func(size, dc, opt); \ | ||
158 | ipred_func(size, dc_left, opt); \ | ||
159 | ipred_func(size, dc_top, opt) | ||
160 | |||
161 | ipred_dc_funcs(4, mmxext); | ||
162 | ipred_dc_funcs(8, mmxext); | ||
163 | |||
164 | #define ipred_dir_tm_funcs(size, opt) \ | ||
165 | ipred_func(size, tm, opt); \ | ||
166 | ipred_func(size, dl, opt); \ | ||
167 | ipred_func(size, dr, opt); \ | ||
168 | ipred_func(size, hd, opt); \ | ||
169 | ipred_func(size, hu, opt); \ | ||
170 | ipred_func(size, vl, opt); \ | ||
171 | ipred_func(size, vr, opt) | ||
172 | |||
173 | ipred_dir_tm_funcs(4, mmxext); | ||
174 | |||
175 | ipred_func(16, v, sse); | ||
176 | ipred_func(32, v, sse); | ||
177 | |||
178 | ipred_dc_funcs(16, sse2); | ||
179 | ipred_dc_funcs(32, sse2); | ||
180 | |||
181 | #define ipred_dir_tm_h_funcs(size, opt) \ | ||
182 | ipred_dir_tm_funcs(size, opt); \ | ||
183 | ipred_func(size, h, opt) | ||
184 | |||
185 | ipred_dir_tm_h_funcs(8, sse2); | ||
186 | ipred_dir_tm_h_funcs(16, sse2); | ||
187 | ipred_dir_tm_h_funcs(32, sse2); | ||
188 | |||
189 | ipred_func(4, h, sse2); | ||
190 | |||
191 | #define ipred_all_funcs(size, opt) \ | ||
192 | ipred_dc_funcs(size, opt); \ | ||
193 | ipred_dir_tm_h_funcs(size, opt) | ||
194 | |||
195 | // FIXME hd/vl_4x4_ssse3 does not exist | ||
196 | ipred_all_funcs(4, ssse3); | ||
197 | ipred_all_funcs(8, ssse3); | ||
198 | ipred_all_funcs(16, ssse3); | ||
199 | ipred_all_funcs(32, ssse3); | ||
200 | |||
201 | ipred_dir_tm_h_funcs(8, avx); | ||
202 | ipred_dir_tm_h_funcs(16, avx); | ||
203 | ipred_dir_tm_h_funcs(32, avx); | ||
204 | |||
205 | ipred_func(32, v, avx); | ||
206 | |||
207 | ipred_dc_funcs(32, avx2); | ||
208 | ipred_func(32, h, avx2); | ||
209 | ipred_func(32, tm, avx2); | ||
210 | |||
211 | #undef ipred_func | ||
212 | #undef ipred_dir_tm_h_funcs | ||
213 | #undef ipred_dir_tm_funcs | ||
214 | #undef ipred_dc_funcs | ||
215 | |||
216 | #endif /* HAVE_X86ASM */ | ||
217 | |||
218 | 674 | av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact) | |
219 | { | ||
220 | #if HAVE_X86ASM | ||
221 | int cpu_flags; | ||
222 | |||
223 |
2/2✓ Branch 0 taken 81 times.
✓ Branch 1 taken 593 times.
|
674 | if (bpp == 10) { |
224 | 81 | ff_vp9dsp_init_10bpp_x86(dsp, bitexact); | |
225 | 81 | return; | |
226 |
2/2✓ Branch 0 taken 75 times.
✓ Branch 1 taken 518 times.
|
593 | } else if (bpp == 12) { |
227 | 75 | ff_vp9dsp_init_12bpp_x86(dsp, bitexact); | |
228 | 75 | return; | |
229 | } | ||
230 | |||
231 | 518 | cpu_flags = av_get_cpu_flags(); | |
232 | |||
233 | #define init_lpf(opt) do { \ | ||
234 | dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_##opt; \ | ||
235 | dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_##opt; \ | ||
236 | dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_##opt; \ | ||
237 | dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_##opt; \ | ||
238 | dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_##opt; \ | ||
239 | dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_##opt; \ | ||
240 | dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_##opt; \ | ||
241 | dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_##opt; \ | ||
242 | dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_##opt; \ | ||
243 | dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_##opt; \ | ||
244 | } while (0) | ||
245 | |||
246 | #define init_ipred(sz, opt, t, e) \ | ||
247 | dsp->intra_pred[TX_##sz##X##sz][e##_PRED] = ff_vp9_ipred_##t##_##sz##x##sz##_##opt | ||
248 | |||
249 | #define ff_vp9_ipred_hd_4x4_ssse3 ff_vp9_ipred_hd_4x4_mmxext | ||
250 | #define ff_vp9_ipred_vl_4x4_ssse3 ff_vp9_ipred_vl_4x4_mmxext | ||
251 | #define init_dir_tm_ipred(sz, opt) do { \ | ||
252 | init_ipred(sz, opt, dl, DIAG_DOWN_LEFT); \ | ||
253 | init_ipred(sz, opt, dr, DIAG_DOWN_RIGHT); \ | ||
254 | init_ipred(sz, opt, hd, HOR_DOWN); \ | ||
255 | init_ipred(sz, opt, vl, VERT_LEFT); \ | ||
256 | init_ipred(sz, opt, hu, HOR_UP); \ | ||
257 | init_ipred(sz, opt, tm, TM_VP8); \ | ||
258 | init_ipred(sz, opt, vr, VERT_RIGHT); \ | ||
259 | } while (0) | ||
260 | #define init_dir_tm_h_ipred(sz, opt) do { \ | ||
261 | init_dir_tm_ipred(sz, opt); \ | ||
262 | init_ipred(sz, opt, h, HOR); \ | ||
263 | } while (0) | ||
264 | #define init_dc_ipred(sz, opt) do { \ | ||
265 | init_ipred(sz, opt, dc, DC); \ | ||
266 | init_ipred(sz, opt, dc_left, LEFT_DC); \ | ||
267 | init_ipred(sz, opt, dc_top, TOP_DC); \ | ||
268 | } while (0) | ||
269 | #define init_all_ipred(sz, opt) do { \ | ||
270 | init_dc_ipred(sz, opt); \ | ||
271 | init_dir_tm_h_ipred(sz, opt); \ | ||
272 | } while (0) | ||
273 | |||
274 |
2/2✓ Branch 0 taken 62 times.
✓ Branch 1 taken 456 times.
|
518 | if (EXTERNAL_MMX(cpu_flags)) { |
275 | 62 | init_fpel_func(4, 0, 4, put, , mmx); | |
276 | 62 | init_fpel_func(3, 0, 8, put, , mmx); | |
277 |
1/2✓ Branch 0 taken 62 times.
✗ Branch 1 not taken.
|
62 | if (!bitexact) { |
278 | 62 | dsp->itxfm_add[4 /* lossless */][DCT_DCT] = | |
279 | 62 | dsp->itxfm_add[4 /* lossless */][ADST_DCT] = | |
280 | 62 | dsp->itxfm_add[4 /* lossless */][DCT_ADST] = | |
281 | 62 | dsp->itxfm_add[4 /* lossless */][ADST_ADST] = ff_vp9_iwht_iwht_4x4_add_mmx; | |
282 | } | ||
283 | 62 | init_ipred(8, mmx, v, VERT); | |
284 | } | ||
285 | |||
286 |
2/2✓ Branch 0 taken 57 times.
✓ Branch 1 taken 461 times.
|
518 | if (EXTERNAL_MMXEXT(cpu_flags)) { |
287 | 57 | dsp->loop_filter_8[0][0] = ff_vp9_loop_filter_h_4_8_mmxext; | |
288 | 57 | dsp->loop_filter_8[0][1] = ff_vp9_loop_filter_v_4_8_mmxext; | |
289 | 57 | dsp->loop_filter_8[1][0] = ff_vp9_loop_filter_h_8_8_mmxext; | |
290 | 57 | dsp->loop_filter_8[1][1] = ff_vp9_loop_filter_v_8_8_mmxext; | |
291 | 57 | init_subpel2(4, 0, 4, put, 8, mmxext); | |
292 | 57 | init_subpel2(4, 1, 4, avg, 8, mmxext); | |
293 | 57 | init_fpel_func(4, 1, 4, avg, _8, mmxext); | |
294 | 57 | init_fpel_func(3, 1, 8, avg, _8, mmxext); | |
295 | 57 | dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_mmxext; | |
296 | 57 | init_dc_ipred(4, mmxext); | |
297 | 57 | init_dc_ipred(8, mmxext); | |
298 | 57 | init_dir_tm_ipred(4, mmxext); | |
299 | } | ||
300 | |||
301 |
2/2✓ Branch 0 taken 52 times.
✓ Branch 1 taken 466 times.
|
518 | if (EXTERNAL_SSE(cpu_flags)) { |
302 | 52 | init_fpel_func(2, 0, 16, put, , sse); | |
303 | 52 | init_fpel_func(1, 0, 32, put, , sse); | |
304 | 52 | init_fpel_func(0, 0, 64, put, , sse); | |
305 | 52 | init_ipred(16, sse, v, VERT); | |
306 | 52 | init_ipred(32, sse, v, VERT); | |
307 | } | ||
308 | |||
309 |
2/2✓ Branch 0 taken 47 times.
✓ Branch 1 taken 471 times.
|
518 | if (EXTERNAL_SSE2(cpu_flags)) { |
310 | 47 | init_subpel3_8to64(0, put, 8, sse2); | |
311 | 47 | init_subpel3_8to64(1, avg, 8, sse2); | |
312 | 47 | init_fpel_func(2, 1, 16, avg, _8, sse2); | |
313 | 47 | init_fpel_func(1, 1, 32, avg, _8, sse2); | |
314 | 47 | init_fpel_func(0, 1, 64, avg, _8, sse2); | |
315 | 47 | init_lpf(sse2); | |
316 | 47 | dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_sse2; | |
317 | 47 | dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_sse2; | |
318 | 47 | dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_sse2; | |
319 | 47 | dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_sse2; | |
320 | 47 | dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_sse2; | |
321 | 47 | dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_sse2; | |
322 | 47 | dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_sse2; | |
323 | 47 | dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_sse2; | |
324 | 47 | dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_sse2; | |
325 | 47 | dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_sse2; | |
326 | 47 | dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_sse2; | |
327 | 47 | dsp->itxfm_add[TX_32X32][ADST_ADST] = | |
328 | 47 | dsp->itxfm_add[TX_32X32][ADST_DCT] = | |
329 | 47 | dsp->itxfm_add[TX_32X32][DCT_ADST] = | |
330 | 47 | dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_sse2; | |
331 | 47 | init_dc_ipred(16, sse2); | |
332 | 47 | init_dc_ipred(32, sse2); | |
333 | 47 | init_dir_tm_h_ipred(8, sse2); | |
334 | 47 | init_dir_tm_h_ipred(16, sse2); | |
335 | 47 | init_dir_tm_h_ipred(32, sse2); | |
336 | 47 | init_ipred(4, sse2, h, HOR); | |
337 | } | ||
338 | |||
339 |
2/2✓ Branch 0 taken 37 times.
✓ Branch 1 taken 481 times.
|
518 | if (EXTERNAL_SSSE3(cpu_flags)) { |
340 | 37 | init_subpel3(0, put, 8, ssse3); | |
341 | 37 | init_subpel3(1, avg, 8, ssse3); | |
342 | 37 | dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_ssse3; | |
343 | 37 | dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_ssse3; | |
344 | 37 | dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_ssse3; | |
345 | 37 | dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_ssse3; | |
346 | 37 | dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_ssse3; | |
347 | 37 | dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_ssse3; | |
348 | 37 | dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_ssse3; | |
349 | 37 | dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_ssse3; | |
350 | 37 | dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_ssse3; | |
351 | 37 | dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_ssse3; | |
352 | 37 | dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_ssse3; | |
353 | 37 | dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_ssse3; | |
354 | 37 | dsp->itxfm_add[TX_32X32][ADST_ADST] = | |
355 | 37 | dsp->itxfm_add[TX_32X32][ADST_DCT] = | |
356 | 37 | dsp->itxfm_add[TX_32X32][DCT_ADST] = | |
357 | 37 | dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_ssse3; | |
358 | 37 | init_lpf(ssse3); | |
359 | 37 | init_all_ipred(4, ssse3); | |
360 | 37 | init_all_ipred(8, ssse3); | |
361 | 37 | init_all_ipred(16, ssse3); | |
362 | 37 | init_all_ipred(32, ssse3); | |
363 | } | ||
364 | |||
365 |
2/2✓ Branch 0 taken 17 times.
✓ Branch 1 taken 501 times.
|
518 | if (EXTERNAL_AVX(cpu_flags)) { |
366 | 17 | dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_avx; | |
367 | 17 | dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_avx; | |
368 | 17 | dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_avx; | |
369 | 17 | dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_avx; | |
370 | 17 | dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx; | |
371 | 17 | dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx; | |
372 | 17 | dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx; | |
373 | 17 | dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx; | |
374 | 17 | dsp->itxfm_add[TX_32X32][ADST_ADST] = | |
375 | 17 | dsp->itxfm_add[TX_32X32][ADST_DCT] = | |
376 | 17 | dsp->itxfm_add[TX_32X32][DCT_ADST] = | |
377 | 17 | dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx; | |
378 | 17 | init_lpf(avx); | |
379 | 17 | init_dir_tm_h_ipred(8, avx); | |
380 | 17 | init_dir_tm_h_ipred(16, avx); | |
381 | 17 | init_dir_tm_h_ipred(32, avx); | |
382 | } | ||
383 |
3/4✓ Branch 0 taken 17 times.
✓ Branch 1 taken 501 times.
✓ Branch 2 taken 17 times.
✗ Branch 3 not taken.
|
518 | if (EXTERNAL_AVX_FAST(cpu_flags)) { |
384 | 17 | init_fpel_func(1, 0, 32, put, , avx); | |
385 | 17 | init_fpel_func(0, 0, 64, put, , avx); | |
386 | 17 | init_ipred(32, avx, v, VERT); | |
387 | } | ||
388 | |||
389 |
3/4✓ Branch 0 taken 7 times.
✓ Branch 1 taken 511 times.
✓ Branch 2 taken 7 times.
✗ Branch 3 not taken.
|
518 | if (EXTERNAL_AVX2_FAST(cpu_flags)) { |
390 | 7 | init_fpel_func(1, 1, 32, avg, _8, avx2); | |
391 | 7 | init_fpel_func(0, 1, 64, avg, _8, avx2); | |
392 | if (ARCH_X86_64) { | ||
393 | #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL | ||
394 | 7 | dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx2; | |
395 | 7 | dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx2; | |
396 | 7 | dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx2; | |
397 | 7 | dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx2; | |
398 | 7 | dsp->itxfm_add[TX_32X32][ADST_ADST] = | |
399 | 7 | dsp->itxfm_add[TX_32X32][ADST_DCT] = | |
400 | 7 | dsp->itxfm_add[TX_32X32][DCT_ADST] = | |
401 | 7 | dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx2; | |
402 | 7 | init_subpel3_32_64(0, put, 8, avx2); | |
403 | 7 | init_subpel3_32_64(1, avg, 8, avx2); | |
404 | #endif | ||
405 | } | ||
406 | 7 | init_dc_ipred(32, avx2); | |
407 | 7 | init_ipred(32, avx2, h, HOR); | |
408 | 7 | init_ipred(32, avx2, tm, TM_VP8); | |
409 | } | ||
410 | |||
411 | #if ARCH_X86_64 | ||
412 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 518 times.
|
518 | if (EXTERNAL_AVX512ICL(cpu_flags)) { |
413 | ✗ | dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx512icl; | |
414 | ✗ | dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx512icl; | |
415 | ✗ | dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx512icl; | |
416 | ✗ | dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx512icl; | |
417 | ✗ | dsp->itxfm_add[TX_32X32][ADST_ADST] = | |
418 | ✗ | dsp->itxfm_add[TX_32X32][ADST_DCT] = | |
419 | ✗ | dsp->itxfm_add[TX_32X32][DCT_ADST] = | |
420 | ✗ | dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx512icl; | |
421 | } | ||
422 | #endif | ||
423 | |||
424 | #undef init_fpel | ||
425 | #undef init_subpel1 | ||
426 | #undef init_subpel2 | ||
427 | #undef init_subpel3 | ||
428 | |||
429 | #endif /* HAVE_X86ASM */ | ||
430 | } | ||
431 |