Line | Branch | Exec | Source |
---|---|---|---|
1 | /* | ||
2 | * This file is part of FFmpeg. | ||
3 | * | ||
4 | * FFmpeg is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU Lesser General Public | ||
6 | * License as published by the Free Software Foundation; either | ||
7 | * version 2.1 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * FFmpeg is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
12 | * Lesser General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU Lesser General Public | ||
15 | * License along with FFmpeg; if not, write to the Free Software | ||
16 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
17 | */ | ||
18 | |||
19 | #include "libavutil/attributes.h" | ||
20 | #include "libavutil/avassert.h" | ||
21 | #include "libavutil/cpu.h" | ||
22 | #include "libavutil/x86/cpu.h" | ||
23 | #include "libavcodec/avcodec.h" | ||
24 | #include "libavcodec/mpegvideoencdsp.h" | ||
25 | /* Prototypes only: these three functions are not defined in this file (presumably they live in external assembly - confirm). ff_mpegvideoencdsp_init_x86() installs them as the pix_sum / pix_norm1 members of the DSP context. */ |||
26 | int ff_pix_sum16_sse2(const uint8_t *pix, int line_size); | ||
27 | int ff_pix_sum16_xop(const uint8_t *pix, int line_size); | ||
28 | int ff_pix_norm1_sse2(const uint8_t *pix, int line_size); | ||
29 | |||
30 | #if HAVE_INLINE_ASM | ||
31 | /* PHADDD(a, t): horizontal add of the two dwords of MMX register a, with t as scratch. t receives a copy of a, a is shifted right by 32 bits, then t is added dword-wise, so the low dword of a ends up holding low + high. */ |||
32 | #define PHADDD(a, t) \ | ||
33 | "movq " #a ", " #t " \n\t" \ | ||
34 | "psrlq $32, " #a " \n\t" \ | ||
35 | "paddd " #t ", " #a " \n\t" | ||
36 | |||
37 | /* | ||
38 | * pmulhw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15])[16 - 31] | ||
39 | * pmulhrw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x8000)[16 - 31] | ||
40 | * pmulhrsw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x4000)[15 - 30] | ||
 | * | ||
 | * PMULHRW(x, y, s, o) multiplies both x and y by s. The plain MMX variant defined right below has no rounding multiply: it uses pmulhw, then adds the offset o to each word and shifts right arithmetically by 1. | ||
41 | */ | ||
42 | #define PMULHRW(x, y, s, o) \ | ||
43 | "pmulhw " #s ", " #x " \n\t" \ | ||
44 | "pmulhw " #s ", " #y " \n\t" \ | ||
45 | "paddw " #o ", " #x " \n\t" \ | ||
46 | "paddw " #o ", " #y " \n\t" \ | ||
47 | "psraw $1, " #x " \n\t" \ | ||
48 | "psraw $1, " #y " \n\t" | ||
49 | #define DEF(x) x ## _mmx | ||
50 | #define SET_RND MOVQ_WONE | ||
51 | #define SCALE_OFFSET 1 | ||
52 | /* First instantiation of the template, with the _mmx name suffix. The template itself is not visible here; presumably it defines try_8x8basis_mmx and add_8x8basis_mmx, which ff_mpegvideoencdsp_init_x86() references - confirm. */ |||
53 | #include "mpegvideoenc_qns_template.c" | ||
54 | |||
55 | #undef DEF | ||
56 | #undef SET_RND | ||
57 | #undef SCALE_OFFSET | ||
58 | #undef PMULHRW | ||
59 | /* Second instantiation (_3dnow suffix): PMULHRW maps directly onto the pmulhrw instruction and ignores its o argument, SET_RND expands to nothing and SCALE_OFFSET is 0. PHADDD is still the MMX definition from above. */ |||
60 | #define DEF(x) x ## _3dnow | ||
61 | #define SET_RND(x) | ||
62 | #define SCALE_OFFSET 0 | ||
63 | #define PMULHRW(x, y, s, o) \ | ||
64 | "pmulhrw " #s ", " #x " \n\t" \ | ||
65 | "pmulhrw " #s ", " #y " \n\t" | ||
66 | |||
67 | #include "mpegvideoenc_qns_template.c" | ||
68 | |||
69 | #undef DEF | ||
70 | #undef SET_RND | ||
71 | #undef SCALE_OFFSET | ||
72 | #undef PMULHRW | ||
73 | /* Third instantiation (_ssse3 suffix), compiled only when HAVE_SSSE3_INLINE is set: PMULHRW maps onto pmulhrsw, SET_RND expands to nothing, SCALE_OFFSET is -1 (presumably because pmulhrsw keeps bits 15 - 30 instead of 16 - 31 - confirm) and PHADDD is redefined. */ |||
74 | #if HAVE_SSSE3_INLINE | ||
75 | #undef PHADDD | ||
76 | #define DEF(x) x ## _ssse3 | ||
77 | #define SET_RND(x) | ||
78 | #define SCALE_OFFSET -1 | ||
79 | /* This PHADDD uses pshufw with selector 0x0E to copy the high dword of a into the low dword of t; the paddd then leaves low + high in the low dword of a, as in the MMX definition. */ |||
80 | #define PHADDD(a, t) \ | ||
81 | "pshufw $0x0E, " #a ", " #t " \n\t" \ | ||
82 | /* faster than phaddd on core2 */ \ | ||
83 | "paddd " #t ", " #a " \n\t" | ||
84 | |||
85 | #define PMULHRW(x, y, s, o) \ | ||
86 | "pmulhrsw " #s ", " #x " \n\t" \ | ||
87 | "pmulhrsw " #s ", " #y " \n\t" | ||
88 | |||
89 | #include "mpegvideoenc_qns_template.c" | ||
90 | |||
91 | #undef DEF | ||
92 | #undef SET_RND | ||
93 | #undef SCALE_OFFSET | ||
94 | #undef PMULHRW | ||
95 | #undef PHADDD | ||
96 | #endif /* HAVE_SSSE3_INLINE */ | ||
97 | |||
98 | /* Draw the edges of an image of size width x height whose rows are 'wrap' bytes apart: the first and the last byte of every row are replicated 'w' bytes to the left and to the right, then (depending on 'sides') the widened first and last rows are replicated 'h' rows upwards and downwards. | ||
99 | * This MMX version has dedicated paths for w == 8 and w == 16; every other value takes the w == 4 path, which is only guarded by av_assert1(). | ||
 | * The rows above and below are written four at a time, so the number of rows written on each side is 'h' rounded up to a multiple of 4. | ||
 | * NOTE(review): none of the asm statements declares clobbers (memory, mm0, mm1) and no emms is issued in this function - presumably the callers deal with that, confirm. */ | ||
100 | 150 | static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, | |
101 | int w, int h, int sides) | ||
102 | { | ||
103 | uint8_t *ptr, *last_line; | ||
104 | int i; | ||
105 | |||
106 | 150 | last_line = buf + (height - 1) * wrap; | |
107 | /* left and right: one pass per image row, starting at buf and repeating while ptr is below buf + wrap * height (the loop body always runs at least once) */ | ||
108 | 150 | ptr = buf; | |
109 |
2/2✓ Branch 0 taken 100 times.
✓ Branch 1 taken 50 times.
|
150 | if (w == 8) { |
110 | 100 | __asm__ volatile ( | |
111 | "1: \n\t" | ||
112 | "movd (%0), %%mm0 \n\t" /* mm0 = first 4 bytes of the row */ | ||
113 | "punpcklbw %%mm0, %%mm0 \n\t" /* punpckl bw/wd/dq: broadcast byte 0 (first pixel) to all 8 bytes */ | ||
114 | "punpcklwd %%mm0, %%mm0 \n\t" | ||
115 | "punpckldq %%mm0, %%mm0 \n\t" | ||
116 | "movq %%mm0, -8(%0) \n\t" /* fill the 8 bytes left of the row */ | ||
117 | "movq -8(%0, %2), %%mm1 \n\t" /* mm1 = last 8 bytes of the row (%2 = width) */ | ||
118 | "punpckhbw %%mm1, %%mm1 \n\t" /* punpckh bw/wd/dq: broadcast byte 7 (last pixel) to all 8 bytes */ | ||
119 | "punpckhwd %%mm1, %%mm1 \n\t" | ||
120 | "punpckhdq %%mm1, %%mm1 \n\t" | ||
121 | "movq %%mm1, (%0, %2) \n\t" /* fill the 8 bytes right of the row */ | ||
122 | "add %1, %0 \n\t" /* advance to the next row (%1 = wrap) */ | ||
123 | "cmp %3, %0 \n\t" /* %3 = buf + wrap * height */ | ||
124 | "jb 1b \n\t" /* unsigned compare: loop while ptr is below the end */ | ||
125 | : "+r" (ptr) | ||
126 | 100 | : "r" ((x86_reg) wrap), "r" ((x86_reg) width), | |
127 | 100 | "r" (ptr + wrap * height)); | |
128 |
1/2✓ Branch 0 taken 50 times.
✗ Branch 1 not taken.
|
50 | } else if (w == 16) { |
129 | 50 | __asm__ volatile ( | |
130 | "1: \n\t" | ||
131 | "movd (%0), %%mm0 \n\t" /* same scheme as w == 8, with two stores per side */ | ||
132 | "punpcklbw %%mm0, %%mm0 \n\t" | ||
133 | "punpcklwd %%mm0, %%mm0 \n\t" | ||
134 | "punpckldq %%mm0, %%mm0 \n\t" | ||
135 | "movq %%mm0, -8(%0) \n\t" | ||
136 | "movq %%mm0, -16(%0) \n\t" /* second 8 bytes of the left edge */ | ||
137 | "movq -8(%0, %2), %%mm1 \n\t" | ||
138 | "punpckhbw %%mm1, %%mm1 \n\t" | ||
139 | "punpckhwd %%mm1, %%mm1 \n\t" | ||
140 | "punpckhdq %%mm1, %%mm1 \n\t" | ||
141 | "movq %%mm1, (%0, %2) \n\t" | ||
142 | "movq %%mm1, 8(%0, %2) \n\t" /* second 8 bytes of the right edge */ | ||
143 | "add %1, %0 \n\t" | ||
144 | "cmp %3, %0 \n\t" | ||
145 | "jb 1b \n\t" | ||
146 | : "+r"(ptr) | ||
147 | 50 | : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height) | |
148 | ); | ||
149 | } else { | ||
150 | av_assert1(w == 4); /* the only remaining supported width; the coverage counters show this path was never executed in this run */ | ||
151 | ✗ | __asm__ volatile ( | |
152 | "1: \n\t" | ||
153 | "movd (%0), %%mm0 \n\t" | ||
154 | "punpcklbw %%mm0, %%mm0 \n\t" /* punpckl bw/wd: the low 4 bytes of mm0 all become the first pixel */ | ||
155 | "punpcklwd %%mm0, %%mm0 \n\t" | ||
156 | "movd %%mm0, -4(%0) \n\t" /* fill the 4 bytes left of the row */ | ||
157 | "movd -4(%0, %2), %%mm1 \n\t" /* mm1 = last 4 bytes of the row */ | ||
158 | "punpcklbw %%mm1, %%mm1 \n\t" /* doubling each byte moves the last pixel into bytes 6 and 7 ... */ | ||
159 | "punpckhwd %%mm1, %%mm1 \n\t" /* ... from where punpckh wd/dq broadcast it to all 8 bytes */ | ||
160 | "punpckhdq %%mm1, %%mm1 \n\t" | ||
161 | "movd %%mm1, (%0, %2) \n\t" /* fill the 4 bytes right of the row */ | ||
162 | "add %1, %0 \n\t" | ||
163 | "cmp %3, %0 \n\t" | ||
164 | "jb 1b \n\t" | ||
165 | : "+r" (ptr) | ||
166 | ✗ | : "r" ((x86_reg) wrap), "r" ((x86_reg) width), | |
167 | ✗ | "r" (ptr + wrap * height)); | |
168 | } | ||
169 | |||
170 | /* top and bottom (and hopefully also the corners): the source row is read starting w bytes left of the image, so it includes the left and right edges written above; width + 2 * w bytes are copied in steps of 8 */ | ||
171 |
1/2✓ Branch 0 taken 150 times.
✗ Branch 1 not taken.
|
150 | if (sides & EDGE_TOP) { |
172 |
2/2✓ Branch 0 taken 400 times.
✓ Branch 1 taken 150 times.
|
550 | for (i = 0; i < h; i += 4) { |
173 | 400 | ptr = buf - (i + 1) * wrap - w; | |
174 | 400 | __asm__ volatile ( | |
175 | "1: \n\t" | ||
176 | "movq (%1, %0), %%mm0 \n\t" /* %1 + %0 = buf - w + bytes already copied: read from the first image row */ | ||
177 | "movq %%mm0, (%0) \n\t" /* row i + 1 above the image */ | ||
178 | "movq %%mm0, (%0, %2) \n\t" /* row i + 2 above (%2 = -wrap) */ | ||
179 | "movq %%mm0, (%0, %2, 2) \n\t" /* row i + 3 above */ | ||
180 | "movq %%mm0, (%0, %3) \n\t" /* row i + 4 above (%3 = -3 * wrap) */ | ||
181 | "add $8, %0 \n\t" | ||
182 | "cmp %4, %0 \n\t" /* %4 = initial ptr + width + 2 * w */ | ||
183 | "jb 1b \n\t" | ||
184 | : "+r" (ptr) | ||
185 | 400 | : "r" ((x86_reg) buf - (x86_reg) ptr - w), | |
186 | 400 | "r" ((x86_reg) - wrap), "r" ((x86_reg) - wrap * 3), | |
187 | 400 | "r" (ptr + width + 2 * w)); | |
188 | } | ||
189 | } | ||
190 | |||
191 |
1/2✓ Branch 0 taken 150 times.
✗ Branch 1 not taken.
|
150 | if (sides & EDGE_BOTTOM) { |
192 |
2/2✓ Branch 0 taken 400 times.
✓ Branch 1 taken 150 times.
|
550 | for (i = 0; i < h; i += 4) { |
193 | 400 | ptr = last_line + (i + 1) * wrap - w; | |
194 | 400 | __asm__ volatile ( | |
195 | "1: \n\t" | ||
196 | "movq (%1, %0), %%mm0 \n\t" /* %1 + %0 = last_line - w + bytes already copied: read from the last image row */ | ||
197 | "movq %%mm0, (%0) \n\t" /* row i + 1 below the image */ | ||
198 | "movq %%mm0, (%0, %2) \n\t" /* row i + 2 below (%2 = wrap) */ | ||
199 | "movq %%mm0, (%0, %2, 2) \n\t" /* row i + 3 below */ | ||
200 | "movq %%mm0, (%0, %3) \n\t" /* row i + 4 below (%3 = 3 * wrap) */ | ||
201 | "add $8, %0 \n\t" | ||
202 | "cmp %4, %0 \n\t" | ||
203 | "jb 1b \n\t" | ||
204 | : "+r" (ptr) | ||
205 | 400 | : "r" ((x86_reg) last_line - (x86_reg) ptr - w), | |
206 | 400 | "r" ((x86_reg) wrap), "r" ((x86_reg) wrap * 3), | |
207 | 400 | "r" (ptr + width + 2 * w)); | |
208 | } | ||
209 | } | ||
210 | 150 | } | |
211 | |||
212 | #endif /* HAVE_INLINE_ASM */ | ||
213 | /* Install the x86 implementations supported by the running CPU (as reported by av_get_cpu_flags()) into the DSP context c. The checks run in a fixed order and later ones overwrite pointers set by earlier ones, so the last matching instruction set wins. Members that no check touches are left as the caller passed them in. avctx is read for its flags and bits_per_raw_sample fields only. */ |||
214 | 358 | av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c, | |
215 | AVCodecContext *avctx) | ||
216 | { | ||
217 | 358 | int cpu_flags = av_get_cpu_flags(); | |
218 | |||
219 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 357 times.
|
358 | if (EXTERNAL_SSE2(cpu_flags)) { |
220 | 1 | c->pix_sum = ff_pix_sum16_sse2; | |
221 | 1 | c->pix_norm1 = ff_pix_norm1_sse2; | |
222 | } | ||
223 | /* XOP replaces pix_sum only; pix_norm1 is not touched by this check */ |||
224 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 358 times.
|
358 | if (EXTERNAL_XOP(cpu_flags)) { |
225 | ✗ | c->pix_sum = ff_pix_sum16_xop; | |
226 | } | ||
227 | |||
228 | #if HAVE_INLINE_ASM | ||
229 | /* Inline-asm implementations, checked in the order MMX, 3DNow!, SSSE3. In each group try_8x8basis is installed only when AV_CODEC_FLAG_BITEXACT is not set, while add_8x8basis is installed unconditionally. The try_8x8basis_* and add_8x8basis_* functions are not defined in the visible part of this file; presumably they come from the mpegvideoenc_qns_template.c instantiations - confirm. */ |||
230 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 357 times.
|
358 | if (INLINE_MMX(cpu_flags)) { |
231 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
|
1 | if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) { |
232 | ✗ | c->try_8x8basis = try_8x8basis_mmx; | |
233 | } | ||
234 | 1 | c->add_8x8basis = add_8x8basis_mmx; | |
235 | /* draw_edges_mmx replicates single bytes, so it is installed only for samples of at most 8 bits */ |||
236 |
1/2✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
|
1 | if (avctx->bits_per_raw_sample <= 8) { |
237 | 1 | c->draw_edges = draw_edges_mmx; | |
238 | } | ||
239 | } | ||
240 | |||
241 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 358 times.
|
358 | if (INLINE_AMD3DNOW(cpu_flags)) { |
242 | ✗ | if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) { | |
243 | ✗ | c->try_8x8basis = try_8x8basis_3dnow; | |
244 | } | ||
245 | ✗ | c->add_8x8basis = add_8x8basis_3dnow; | |
246 | } | ||
247 | |||
248 | #if HAVE_SSSE3_INLINE | ||
249 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 357 times.
|
358 | if (INLINE_SSSE3(cpu_flags)) { |
250 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
|
1 | if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) { |
251 | ✗ | c->try_8x8basis = try_8x8basis_ssse3; | |
252 | } | ||
253 | 1 | c->add_8x8basis = add_8x8basis_ssse3; | |
254 | } | ||
255 | #endif /* HAVE_SSSE3_INLINE */ | ||
256 | |||
257 | #endif /* HAVE_INLINE_ASM */ | ||
258 | 358 | } | |
259 |