Line | Branch | Exec | Source |
---|---|---|---|
1 | /* | ||
2 | * This file is part of FFmpeg. | ||
3 | * | ||
4 | * FFmpeg is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU Lesser General Public | ||
6 | * License as published by the Free Software Foundation; either | ||
7 | * version 2.1 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * FFmpeg is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
12 | * Lesser General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU Lesser General Public | ||
15 | * License along with FFmpeg; if not, write to the Free Software | ||
16 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
17 | */ | ||
18 | |||
19 | #include "libavutil/attributes.h" | ||
20 | #include "libavutil/avassert.h" | ||
21 | #include "libavutil/cpu.h" | ||
22 | #include "libavutil/x86/cpu.h" | ||
23 | #include "libavcodec/avcodec.h" | ||
24 | #include "libavcodec/mpegvideoencdsp.h" | ||
25 | |||
/*
 * Prototypes of the pixel-statistics kernels that
 * ff_mpegvideoencdsp_init_x86() installs as the pix_sum / pix_norm1
 * callbacks. They are only declared in this file; presumably they are
 * implemented in standalone assembly -- confirm against the build.
 */
int ff_pix_sum16_sse2(const uint8_t *pix, ptrdiff_t line_size);
int ff_pix_sum16_xop(const uint8_t *pix, ptrdiff_t line_size);
int ff_pix_norm1_sse2(const uint8_t *pix, ptrdiff_t line_size);
29 | |||
30 | #if HAVE_INLINE_ASM | ||
31 | |||
32 | #define PHADDD(a, t) \ | ||
33 | "movq " #a ", " #t " \n\t" \ | ||
34 | "psrlq $32, " #a " \n\t" \ | ||
35 | "paddd " #t ", " #a " \n\t" | ||
36 | |||
37 | /* | ||
38 | * pmulhw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15])[16 - 31] | ||
39 | * pmulhrw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x8000)[16 - 31] | ||
40 | * pmulhrsw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x4000)[15 - 30] | ||
41 | */ | ||
42 | #define PMULHRW(x, y, s, o) \ | ||
43 | "pmulhw " #s ", " #x " \n\t" \ | ||
44 | "pmulhw " #s ", " #y " \n\t" \ | ||
45 | "paddw " #o ", " #x " \n\t" \ | ||
46 | "paddw " #o ", " #y " \n\t" \ | ||
47 | "psraw $1, " #x " \n\t" \ | ||
48 | "psraw $1, " #y " \n\t" | ||
49 | #define DEF(x) x ## _mmx | ||
50 | #define SET_RND MOVQ_WONE | ||
51 | #define SCALE_OFFSET 1 | ||
52 | |||
53 | #include "mpegvideoenc_qns_template.c" | ||
54 | |||
55 | #undef DEF | ||
56 | #undef SET_RND | ||
57 | #undef SCALE_OFFSET | ||
58 | #undef PMULHRW | ||
59 | |||
60 | #define DEF(x) x ## _3dnow | ||
61 | #define SET_RND(x) | ||
62 | #define SCALE_OFFSET 0 | ||
63 | #define PMULHRW(x, y, s, o) \ | ||
64 | "pmulhrw " #s ", " #x " \n\t" \ | ||
65 | "pmulhrw " #s ", " #y " \n\t" | ||
66 | |||
67 | #include "mpegvideoenc_qns_template.c" | ||
68 | |||
69 | #undef DEF | ||
70 | #undef SET_RND | ||
71 | #undef SCALE_OFFSET | ||
72 | #undef PMULHRW | ||
73 | |||
74 | #if HAVE_SSSE3_INLINE | ||
75 | #undef PHADDD | ||
76 | #define DEF(x) x ## _ssse3 | ||
77 | #define SET_RND(x) | ||
78 | #define SCALE_OFFSET -1 | ||
79 | |||
80 | #define PHADDD(a, t) \ | ||
81 | "pshufw $0x0E, " #a ", " #t " \n\t" \ | ||
82 | /* faster than phaddd on core2 */ \ | ||
83 | "paddd " #t ", " #a " \n\t" | ||
84 | |||
85 | #define PMULHRW(x, y, s, o) \ | ||
86 | "pmulhrsw " #s ", " #x " \n\t" \ | ||
87 | "pmulhrsw " #s ", " #y " \n\t" | ||
88 | |||
89 | #include "mpegvideoenc_qns_template.c" | ||
90 | |||
91 | #undef DEF | ||
92 | #undef SET_RND | ||
93 | #undef SCALE_OFFSET | ||
94 | #undef PMULHRW | ||
95 | #undef PHADDD | ||
96 | #endif /* HAVE_SSSE3_INLINE */ | ||
97 | |||
98 | /* Draw the edges of width 'w' of an image of size width, height */ | ||
99 | 165 | static void draw_edges_mmx(uint8_t *buf, ptrdiff_t wrap, int width, int height, | |
100 | int w, int h, int sides) | ||
101 | { | ||
102 | uint8_t *ptr, *last_line; | ||
103 | int i; | ||
104 | |||
105 | /* left and right */ | ||
106 | 165 | ptr = buf; | |
107 |
2/2✓ Branch 0 taken 105 times.
✓ Branch 1 taken 60 times.
|
165 | if (w == 8) { |
108 | 105 | __asm__ volatile ( | |
109 | "1: \n\t" | ||
110 | "movd (%0), %%mm0 \n\t" | ||
111 | "punpcklbw %%mm0, %%mm0 \n\t" | ||
112 | "punpcklwd %%mm0, %%mm0 \n\t" | ||
113 | "punpckldq %%mm0, %%mm0 \n\t" | ||
114 | "movq %%mm0, -8(%0) \n\t" | ||
115 | "movq -8(%0, %2), %%mm1 \n\t" | ||
116 | "punpckhbw %%mm1, %%mm1 \n\t" | ||
117 | "punpckhwd %%mm1, %%mm1 \n\t" | ||
118 | "punpckhdq %%mm1, %%mm1 \n\t" | ||
119 | "movq %%mm1, (%0, %2) \n\t" | ||
120 | "add %1, %0 \n\t" | ||
121 | "cmp %3, %0 \n\t" | ||
122 | "jnz 1b \n\t" | ||
123 | : "+r" (ptr) | ||
124 | 105 | : "r" ((x86_reg) wrap), "r" ((x86_reg) width), | |
125 | 105 | "r" (ptr + wrap * height)); | |
126 |
2/2✓ Branch 0 taken 55 times.
✓ Branch 1 taken 5 times.
|
60 | } else if (w == 16) { |
127 | 55 | __asm__ volatile ( | |
128 | "1: \n\t" | ||
129 | "movd (%0), %%mm0 \n\t" | ||
130 | "punpcklbw %%mm0, %%mm0 \n\t" | ||
131 | "punpcklwd %%mm0, %%mm0 \n\t" | ||
132 | "punpckldq %%mm0, %%mm0 \n\t" | ||
133 | "movq %%mm0, -8(%0) \n\t" | ||
134 | "movq %%mm0, -16(%0) \n\t" | ||
135 | "movq -8(%0, %2), %%mm1 \n\t" | ||
136 | "punpckhbw %%mm1, %%mm1 \n\t" | ||
137 | "punpckhwd %%mm1, %%mm1 \n\t" | ||
138 | "punpckhdq %%mm1, %%mm1 \n\t" | ||
139 | "movq %%mm1, (%0, %2) \n\t" | ||
140 | "movq %%mm1, 8(%0, %2) \n\t" | ||
141 | "add %1, %0 \n\t" | ||
142 | "cmp %3, %0 \n\t" | ||
143 | "jnz 1b \n\t" | ||
144 | : "+r"(ptr) | ||
145 | 55 | : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height) | |
146 | ); | ||
147 | } else { | ||
148 | av_assert1(w == 4); | ||
149 | 5 | __asm__ volatile ( | |
150 | "1: \n\t" | ||
151 | "movd (%0), %%mm0 \n\t" | ||
152 | "punpcklbw %%mm0, %%mm0 \n\t" | ||
153 | "punpcklwd %%mm0, %%mm0 \n\t" | ||
154 | "movd %%mm0, -4(%0) \n\t" | ||
155 | "movd -4(%0, %2), %%mm1 \n\t" | ||
156 | "punpcklbw %%mm1, %%mm1 \n\t" | ||
157 | "punpckhwd %%mm1, %%mm1 \n\t" | ||
158 | "punpckhdq %%mm1, %%mm1 \n\t" | ||
159 | "movd %%mm1, (%0, %2) \n\t" | ||
160 | "add %1, %0 \n\t" | ||
161 | "cmp %3, %0 \n\t" | ||
162 | "jnz 1b \n\t" | ||
163 | : "+r" (ptr) | ||
164 | 5 | : "r" ((x86_reg) wrap), "r" ((x86_reg) width), | |
165 | 5 | "r" (ptr + wrap * height)); | |
166 | } | ||
167 | |||
168 | /* top and bottom + corners */ | ||
169 | 165 | buf -= w; | |
170 | 165 | last_line = buf + (height - 1) * wrap; | |
171 |
1/2✓ Branch 0 taken 165 times.
✗ Branch 1 not taken.
|
165 | if (sides & EDGE_TOP) |
172 |
2/2✓ Branch 0 taken 1740 times.
✓ Branch 1 taken 165 times.
|
1905 | for (i = 0; i < h; i++) |
173 | // top | ||
174 | 1740 | memcpy(buf - (i + 1) * wrap, buf, width + w + w); | |
175 |
1/2✓ Branch 0 taken 165 times.
✗ Branch 1 not taken.
|
165 | if (sides & EDGE_BOTTOM) |
176 |
2/2✓ Branch 0 taken 1740 times.
✓ Branch 1 taken 165 times.
|
1905 | for (i = 0; i < h; i++) |
177 | // bottom | ||
178 | 1740 | memcpy(last_line + (i + 1) * wrap, last_line, width + w + w); | |
179 | 165 | } | |
180 | |||
181 | #endif /* HAVE_INLINE_ASM */ | ||
182 | |||
183 | 375 | av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c, | |
184 | AVCodecContext *avctx) | ||
185 | { | ||
186 | 375 | int cpu_flags = av_get_cpu_flags(); | |
187 | |||
188 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 365 times.
|
375 | if (EXTERNAL_SSE2(cpu_flags)) { |
189 | 10 | c->pix_sum = ff_pix_sum16_sse2; | |
190 | 10 | c->pix_norm1 = ff_pix_norm1_sse2; | |
191 | } | ||
192 | |||
193 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 375 times.
|
375 | if (EXTERNAL_XOP(cpu_flags)) { |
194 | ✗ | c->pix_sum = ff_pix_sum16_xop; | |
195 | } | ||
196 | |||
197 | #if HAVE_INLINE_ASM | ||
198 | |||
199 |
2/2✓ Branch 0 taken 13 times.
✓ Branch 1 taken 362 times.
|
375 | if (INLINE_MMX(cpu_flags)) { |
200 |
2/2✓ Branch 0 taken 12 times.
✓ Branch 1 taken 1 times.
|
13 | if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) { |
201 | 12 | c->try_8x8basis = try_8x8basis_mmx; | |
202 | } | ||
203 | 13 | c->add_8x8basis = add_8x8basis_mmx; | |
204 | |||
205 |
1/2✓ Branch 0 taken 13 times.
✗ Branch 1 not taken.
|
13 | if (avctx->bits_per_raw_sample <= 8) { |
206 | 13 | c->draw_edges = draw_edges_mmx; | |
207 | } | ||
208 | } | ||
209 | |||
210 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 375 times.
|
375 | if (INLINE_AMD3DNOW(cpu_flags)) { |
211 | ✗ | if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) { | |
212 | ✗ | c->try_8x8basis = try_8x8basis_3dnow; | |
213 | } | ||
214 | ✗ | c->add_8x8basis = add_8x8basis_3dnow; | |
215 | } | ||
216 | |||
217 | #if HAVE_SSSE3_INLINE | ||
218 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 367 times.
|
375 | if (INLINE_SSSE3(cpu_flags)) { |
219 |
2/2✓ Branch 0 taken 7 times.
✓ Branch 1 taken 1 times.
|
8 | if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) { |
220 | 7 | c->try_8x8basis = try_8x8basis_ssse3; | |
221 | } | ||
222 | 8 | c->add_8x8basis = add_8x8basis_ssse3; | |
223 | } | ||
224 | #endif /* HAVE_SSSE3_INLINE */ | ||
225 | |||
226 | #endif /* HAVE_INLINE_ASM */ | ||
227 | 375 | } | |
228 |