FFmpeg coverage


Directory: ../../../ffmpeg/
File: src/libavcodec/x86/hpeldsp_init.c
Date: 2025-10-10 03:51:19
            Exec  Total  Coverage
Lines:        64     64    100.0%
Functions:     9      9    100.0%
Branches:     10     10    100.0%

Line Branch Exec Source
1 /*
2 * SIMD-optimized halfpel functions
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 *
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
23 */
24
25 #include <stddef.h>
26 #include <stdint.h>
27
28 #include "libavutil/attributes.h"
29 #include "libavutil/cpu.h"
30 #include "libavutil/x86/cpu.h"
31 #include "libavcodec/avcodec.h"
32 #include "libavcodec/hpeldsp.h"
33 #include "libavcodec/pixels.h"
34 #include "fpel.h"
35 #include "hpeldsp.h"
36 #include "inline_asm.h"
37
38 void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
39 ptrdiff_t line_size, int h);
40 void ff_put_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
41 ptrdiff_t line_size, int h);
42 void ff_avg_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
43 ptrdiff_t line_size, int h);
44 void ff_put_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
45 ptrdiff_t line_size, int h);
46 void ff_avg_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
47 ptrdiff_t line_size, int h);
48 void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
49 ptrdiff_t line_size, int h);
50 void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
51 const uint8_t *pixels,
52 ptrdiff_t line_size, int h);
53 void ff_put_no_rnd_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
54 ptrdiff_t line_size, int h);
55 void ff_avg_no_rnd_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
56 ptrdiff_t line_size, int h);
57 void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
58 ptrdiff_t line_size, int h);
59 void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
60 ptrdiff_t line_size, int h);
61 void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
62 const uint8_t *pixels,
63 ptrdiff_t line_size, int h);
64 void ff_put_no_rnd_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
65 ptrdiff_t line_size, int h);
66 void ff_avg_no_rnd_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
67 ptrdiff_t line_size, int h);
68 void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
69 ptrdiff_t line_size, int h);
70 void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
71 ptrdiff_t line_size, int h);
72
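Note: every assembler entry point declared above shares the op_pixels_func
signature from libavcodec/hpeldsp.h, reproduced below for reference (the
parameter comments are editorial). The name encodes the operation (put stores
the result, avg averages it with what is already in the destination), the
rounding mode (no_rnd variants bias the interpolation down by one), the block
width (8 or 16 pixels), the half-pel offset (x2 = horizontal, y2 = vertical,
xy2 = both) and the instruction-set suffix.

    typedef void (*op_pixels_func)(uint8_t *block,        /* destination */
                                   const uint8_t *pixels, /* source */
                                   ptrdiff_t line_size,   /* stride of both */
                                   int h);                /* rows to process */
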
73 #if HAVE_INLINE_ASM
74
75 /***********************************/
76 /* MMX no rounding */
77
78 // put_pixels
79 72222 static void put_no_rnd_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
80 ptrdiff_t line_size, int h)
81 {
82 72222 MOVQ_ZERO(mm7);
83 72222 MOVQ_WONE(mm6); // =1 for no_rnd version
84 72222 __asm__ volatile(
85 "movq (%1), %%mm0 \n\t"
86 "movq 1(%1), %%mm4 \n\t"
87 "movq %%mm0, %%mm1 \n\t"
88 "movq %%mm4, %%mm5 \n\t"
89 "punpcklbw %%mm7, %%mm0 \n\t"
90 "punpcklbw %%mm7, %%mm4 \n\t"
91 "punpckhbw %%mm7, %%mm1 \n\t"
92 "punpckhbw %%mm7, %%mm5 \n\t"
93 "paddusw %%mm0, %%mm4 \n\t"
94 "paddusw %%mm1, %%mm5 \n\t"
95 "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
96 "add %3, %1 \n\t"
97 ".p2align 3 \n\t"
98 "1: \n\t"
99 "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
100 "movq 1(%1, %%"FF_REG_a"), %%mm2 \n\t"
101 "movq %%mm0, %%mm1 \n\t"
102 "movq %%mm2, %%mm3 \n\t"
103 "punpcklbw %%mm7, %%mm0 \n\t"
104 "punpcklbw %%mm7, %%mm2 \n\t"
105 "punpckhbw %%mm7, %%mm1 \n\t"
106 "punpckhbw %%mm7, %%mm3 \n\t"
107 "paddusw %%mm2, %%mm0 \n\t"
108 "paddusw %%mm3, %%mm1 \n\t"
109 "paddusw %%mm6, %%mm4 \n\t"
110 "paddusw %%mm6, %%mm5 \n\t"
111 "paddusw %%mm0, %%mm4 \n\t"
112 "paddusw %%mm1, %%mm5 \n\t"
113 "psrlw $2, %%mm4 \n\t"
114 "psrlw $2, %%mm5 \n\t"
115 "packuswb %%mm5, %%mm4 \n\t"
116 "movq %%mm4, (%2, %%"FF_REG_a") \n\t"
117 "add %3, %%"FF_REG_a" \n\t"
118
119 "movq (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
120 "movq 1(%1, %%"FF_REG_a"), %%mm4 \n\t"
121 "movq %%mm2, %%mm3 \n\t"
122 "movq %%mm4, %%mm5 \n\t"
123 "punpcklbw %%mm7, %%mm2 \n\t"
124 "punpcklbw %%mm7, %%mm4 \n\t"
125 "punpckhbw %%mm7, %%mm3 \n\t"
126 "punpckhbw %%mm7, %%mm5 \n\t"
127 "paddusw %%mm2, %%mm4 \n\t"
128 "paddusw %%mm3, %%mm5 \n\t"
129 "paddusw %%mm6, %%mm0 \n\t"
130 "paddusw %%mm6, %%mm1 \n\t"
131 "paddusw %%mm4, %%mm0 \n\t"
132 "paddusw %%mm5, %%mm1 \n\t"
133 "psrlw $2, %%mm0 \n\t"
134 "psrlw $2, %%mm1 \n\t"
135 "packuswb %%mm1, %%mm0 \n\t"
136 "movq %%mm0, (%2, %%"FF_REG_a") \n\t"
137 "add %3, %%"FF_REG_a" \n\t"
138
139 "subl $2, %0 \n\t"
140 "jnz 1b \n\t"
141 :"+g"(h), "+S"(pixels)
142 :"D"(block), "r"((x86_reg)line_size)
143 :FF_REG_a, "memory");
144 72222 }
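Note: the loop above emits one output row per half-iteration and handles two
rows per pass so that the horizontal pair sums of the shared middle row can be
reused (the register-role swap flagged by the "0 <-> 2 1 <-> 3" comment). As a
scalar sketch with a hypothetical helper name, each output byte is a 2x2 box
average with the +1 no-rounding bias; the rounding variant would add 2 instead:

    /* Scalar reference sketch of the arithmetic the MMX loop implements. */
    static void put_no_rnd_pixels8_xy2_ref(uint8_t *block, const uint8_t *pixels,
                                           ptrdiff_t line_size, int h)
    {
        for (int i = 0; i < h; i++) {
            for (int x = 0; x < 8; x++)
                block[x] = (pixels[x]             + pixels[x + 1] +
                            pixels[line_size + x] + pixels[line_size + x + 1]
                            + 1) >> 2;
            pixels += line_size;
            block  += line_size;
        }
    }
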
145
146 // this routine is 'slightly' suboptimal but mostly unused
147 2 static void avg_no_rnd_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
148 ptrdiff_t line_size, int h)
149 {
150 2 MOVQ_ZERO(mm7);
151 2 MOVQ_WONE(mm6); // =1 for this no_rnd version (the rnd variant uses =2)
152 2 __asm__ volatile(
153 "movq (%1), %%mm0 \n\t"
154 "movq 1(%1), %%mm4 \n\t"
155 "movq %%mm0, %%mm1 \n\t"
156 "movq %%mm4, %%mm5 \n\t"
157 "punpcklbw %%mm7, %%mm0 \n\t"
158 "punpcklbw %%mm7, %%mm4 \n\t"
159 "punpckhbw %%mm7, %%mm1 \n\t"
160 "punpckhbw %%mm7, %%mm5 \n\t"
161 "paddusw %%mm0, %%mm4 \n\t"
162 "paddusw %%mm1, %%mm5 \n\t"
163 "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
164 "add %3, %1 \n\t"
165 ".p2align 3 \n\t"
166 "1: \n\t"
167 "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
168 "movq 1(%1, %%"FF_REG_a"), %%mm2 \n\t"
169 "movq %%mm0, %%mm1 \n\t"
170 "movq %%mm2, %%mm3 \n\t"
171 "punpcklbw %%mm7, %%mm0 \n\t"
172 "punpcklbw %%mm7, %%mm2 \n\t"
173 "punpckhbw %%mm7, %%mm1 \n\t"
174 "punpckhbw %%mm7, %%mm3 \n\t"
175 "paddusw %%mm2, %%mm0 \n\t"
176 "paddusw %%mm3, %%mm1 \n\t"
177 "paddusw %%mm6, %%mm4 \n\t"
178 "paddusw %%mm6, %%mm5 \n\t"
179 "paddusw %%mm0, %%mm4 \n\t"
180 "paddusw %%mm1, %%mm5 \n\t"
181 "psrlw $2, %%mm4 \n\t"
182 "psrlw $2, %%mm5 \n\t"
183 "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
184 "packuswb %%mm5, %%mm4 \n\t"
185 "pcmpeqd %%mm2, %%mm2 \n\t"
186 "paddb %%mm2, %%mm2 \n\t"
187 PAVGB_MMX(%%mm3, %%mm4, %%mm5, %%mm2)
188 "movq %%mm5, (%2, %%"FF_REG_a") \n\t"
189 "add %3, %%"FF_REG_a" \n\t"
190
191 "movq (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
192 "movq 1(%1, %%"FF_REG_a"), %%mm4 \n\t"
193 "movq %%mm2, %%mm3 \n\t"
194 "movq %%mm4, %%mm5 \n\t"
195 "punpcklbw %%mm7, %%mm2 \n\t"
196 "punpcklbw %%mm7, %%mm4 \n\t"
197 "punpckhbw %%mm7, %%mm3 \n\t"
198 "punpckhbw %%mm7, %%mm5 \n\t"
199 "paddusw %%mm2, %%mm4 \n\t"
200 "paddusw %%mm3, %%mm5 \n\t"
201 "paddusw %%mm6, %%mm0 \n\t"
202 "paddusw %%mm6, %%mm1 \n\t"
203 "paddusw %%mm4, %%mm0 \n\t"
204 "paddusw %%mm5, %%mm1 \n\t"
205 "psrlw $2, %%mm0 \n\t"
206 "psrlw $2, %%mm1 \n\t"
207 "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
208 "packuswb %%mm1, %%mm0 \n\t"
209 "pcmpeqd %%mm2, %%mm2 \n\t"
210 "paddb %%mm2, %%mm2 \n\t"
211 PAVGB_MMX(%%mm3, %%mm0, %%mm1, %%mm2)
212 "movq %%mm1, (%2, %%"FF_REG_a") \n\t"
213 "add %3, %%"FF_REG_a" \n\t"
214
215 "subl $2, %0 \n\t"
216 "jnz 1b \n\t"
217 :"+g"(h), "+S"(pixels)
218 :"D"(block), "r"((x86_reg)line_size)
219 :FF_REG_a, "memory");
220 2 }
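Note: plain MMX has no pavgb instruction, so PAVGB_MMX synthesizes a
rounding-up byte average of the freshly interpolated pixels and the existing
destination from a bitwise identity; the pcmpeqd/paddb pair above builds the
all-0xFE byte mask it needs so that the word-sized shift cannot leak bits
between neighboring bytes. A scalar sketch of the identity, with a
hypothetical helper name:

    /* (a + b + 1) >> 1  ==  (a | b) - ((a ^ b) >> 1), per byte.
     * The & 0xFE only matters in the SIMD version, where the shift is
     * wider than a byte; for scalar uint8_t math it is a no-op. */
    static inline uint8_t avg_ceil(uint8_t a, uint8_t b)
    {
        return (a | b) - (((a ^ b) & 0xFE) >> 1);
    }
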
221
222 #if HAVE_MMX
223 1 CALL_2X_PIXELS(avg_no_rnd_pixels16_xy2_mmx, avg_no_rnd_pixels8_xy2_mmx, 8)
224 10119 CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8)
225 #endif
226 #endif /* HAVE_INLINE_ASM */
227
228 171 static void hpeldsp_init_mmx(HpelDSPContext *c, int flags)
229 {
230 #if HAVE_MMX_INLINE
231 171 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
232 171 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
233 171 c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_mmx;
234 #endif
235 #if HAVE_MMX_EXTERNAL
236 171 c->put_no_rnd_pixels_tab[1][0] =
237 171 c->put_pixels_tab[1][0] = ff_put_pixels8_mmx;
238 #endif
239 171 }
240
241 170 static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags)
242 {
243 #if HAVE_MMXEXT_EXTERNAL
244 170 c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
245 170 c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
246
247 170 c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
248 170 c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
249 170 c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
250
251 170 c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
252 170 c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
253
254 2/2 170 if (!(flags & AV_CODEC_FLAG_BITEXACT)) {
            ✓ Branch 0 taken 142 times.
            ✓ Branch 1 taken 28 times.
255 142 c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
256 142 c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
257 }
258 #endif /* HAVE_MMXEXT_EXTERNAL */
259 170 }
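Note: the _exact variants are installed unconditionally, and the plain mmxext
versions replace them only when the codec did not request bit-exact output.
The usual reason, stated here as an assumption about the asm not shown in this
file: the fast paths lean on pavgb-style averaging, which rounds up and can
therefore differ from the exact no-rounding result by one LSB. Worked on a
single pair of bytes:

    /* no_rnd average, exact:  (1 + 2)     >> 1 == 1
     * pavgb-style average:    (1 + 2 + 1) >> 1 == 2
     * One LSB apart, hence the AV_CODEC_FLAG_BITEXACT gate above
     * (branch data: 142 calls took the fast path, 28 stayed exact). */
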
260
261 168 static void hpeldsp_init_sse2(HpelDSPContext *c, int flags)
262 {
263 #if HAVE_SSE2_EXTERNAL
264 168 c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
265 168 c->put_pixels_tab[0][1] = ff_put_pixels16_x2_sse2;
266 168 c->put_pixels_tab[0][2] = ff_put_pixels16_y2_sse2;
267 168 c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_sse2;
268
269 168 c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
270 168 c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_sse2;
271 168 c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_sse2;
272
273 168 c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
274 168 c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_sse2;
275 168 c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_sse2;
276 168 c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_sse2;
277
278 168 c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_sse2;
279 168 c->avg_no_rnd_pixels_tab[1] = ff_avg_no_rnd_pixels16_x2_sse2;
280 168 c->avg_no_rnd_pixels_tab[2] = ff_avg_no_rnd_pixels16_y2_sse2;
281 #endif /* HAVE_SSE2_EXTERNAL */
282 168 }
283
284 166 static void hpeldsp_init_ssse3(HpelDSPContext *c, int flags)
285 {
286 #if HAVE_SSSE3_EXTERNAL
287 166 c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_ssse3;
288 166 c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_ssse3;
289 166 c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_ssse3;
290 166 c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_ssse3;
291 #endif
292 166 }
293
294 941 av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
295 {
296 941 int cpu_flags = av_get_cpu_flags();
297
298 2/2 941 if (INLINE_MMX(cpu_flags))
            ✓ Branch 0 taken 171 times.
            ✓ Branch 1 taken 770 times.
299 171 hpeldsp_init_mmx(c, flags);
300
301 2/2 941 if (EXTERNAL_MMXEXT(cpu_flags))
            ✓ Branch 0 taken 170 times.
            ✓ Branch 1 taken 771 times.
302 170 hpeldsp_init_mmxext(c, flags);
303
304 2/2 941 if (EXTERNAL_SSE2(cpu_flags))
            ✓ Branch 0 taken 168 times.
            ✓ Branch 1 taken 773 times.
305 168 hpeldsp_init_sse2(c, flags);
306
307 2/2 941 if (EXTERNAL_SSSE3(cpu_flags))
            ✓ Branch 0 taken 166 times.
            ✓ Branch 1 taken 775 times.
308 166 hpeldsp_init_ssse3(c, flags);
309 941 }
310
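Note: a sketch of how a caller reaches these pointers once initialization has
run. As used in this file, the first table index selects the block width
(0 = 16 pixels, 1 = 8) and the second encodes the half-pel offset as
dx + 2*dy (0 = copy, 1 = x2, 2 = y2, 3 = xy2). Because ff_hpeldsp_init_x86
tests the CPU flags in ascending ISA order, each later init overrides entries
a weaker one set, so the most capable implementation wins. The avctx, dst,
src and stride names below are hypothetical caller context:

    #include "libavcodec/hpeldsp.h"

    HpelDSPContext c;
    ff_hpeldsp_init(&c, avctx->flags);   /* C defaults, then per-arch overrides */

    /* 8x8 block, shifted half a pixel in both x and y: */
    c.put_pixels_tab[1][3](dst, src, stride, 8);
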