Line | Branch | Exec | Source |
---|---|---|---|
1 | /* | ||
2 | * This file is part of FFmpeg. | ||
3 | * | ||
4 | * FFmpeg is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU Lesser General Public | ||
6 | * License as published by the Free Software Foundation; either | ||
7 | * version 2.1 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * FFmpeg is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
12 | * Lesser General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU Lesser General Public | ||
15 | * License along with FFmpeg; if not, write to the Free Software | ||
16 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
17 | */ | ||
18 | |||
19 | #include <stdint.h> | ||
20 | |||
21 | #include "libavutil/attributes.h" | ||
22 | #include "libavutil/avassert.h" | ||
23 | #include "libavutil/common.h" | ||
24 | #include "libavutil/cpu.h" | ||
25 | #include "libavutil/x86/asm.h" | ||
26 | #include "libavutil/x86/cpu.h" | ||
27 | #include "libavcodec/avcodec.h" | ||
28 | #include "libavcodec/mpegvideoencdsp.h" | ||
29 | |||
30 | int ff_pix_sum16_sse2(const uint8_t *pix, ptrdiff_t line_size); | ||
31 | int ff_pix_sum16_xop(const uint8_t *pix, ptrdiff_t line_size); | ||
32 | int ff_pix_norm1_sse2(const uint8_t *pix, ptrdiff_t line_size); | ||
33 | |||
34 | #if HAVE_INLINE_ASM | ||
35 | #if HAVE_SSSE3_INLINE | ||
36 | #define SCALE_OFFSET -1 | ||
37 | |||
38 | #define MAX_ABS 512 | ||
39 | |||
/**
 * Estimate the error that would result from adding 'basis', scaled by
 * 'scale', into the residual 'rem', weighting each coefficient by 'weight'.
 * SSSE3 version using pmulhrsw (rounded high-half multiply).
 *
 * @param rem    64 residual coefficients (8x8 block)
 * @param weight 64 per-coefficient weights
 * @param basis  64 basis-function coefficients
 * @param scale  scale factor; |scale| must be < MAX_ABS (checked below)
 * @return       accumulated weighted squared error (internally >>4 per
 *               iteration and >>2 after the horizontal sum)
 *
 * NOTE(review): pmulhrsw rounds differently from a plain shift, so this is
 * not bit-exact vs. the C reference — the init code only installs it when
 * AV_CODEC_FLAG_BITEXACT is unset.
 */
static int try_8x8basis_ssse3(const int16_t rem[64], const int16_t weight[64], const int16_t basis[64], int scale)
{
    x86_reg i=0;

    av_assert2(FFABS(scale) < MAX_ABS);
    /* Pre-shift so that pmulhrsw's implicit (x*y*2 + 2^15)>>16 yields the
     * intended fixed-point product; the shifted value must fit in 16 bits. */
    scale <<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;

    __asm__ volatile(
        "pxor %%xmm2, %%xmm2            \n\t" /* xmm2 = dword accumulator */
        "movd %4, %%xmm3                \n\t" /* broadcast scale to all */
        "punpcklwd %%xmm3, %%xmm3       \n\t" /* 8 word lanes of xmm3   */
        "pshufd $0, %%xmm3, %%xmm3      \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        /* load 16 basis coefficients (%1 = basis, %0 = byte offset) */
        "movdqa (%1, %0), %%xmm0        \n\t"
        "movdqa 16(%1, %0), %%xmm1      \n\t"
        /* scale them with rounding, then add the residual (%2 = rem) */
        "pmulhrsw %%xmm3, %%xmm0        \n\t"
        "pmulhrsw %%xmm3, %%xmm1        \n\t"
        "paddw (%2, %0), %%xmm0         \n\t"
        "paddw 16(%2, %0), %%xmm1       \n\t"
        "psraw $6, %%xmm0               \n\t"
        "psraw $6, %%xmm1               \n\t"
        /* weight (%3), square and pairwise-sum into dwords */
        "pmullw (%3, %0), %%xmm0        \n\t"
        "pmullw 16(%3, %0), %%xmm1      \n\t"
        "pmaddwd %%xmm0, %%xmm0         \n\t"
        "pmaddwd %%xmm1, %%xmm1         \n\t"
        "paddd %%xmm1, %%xmm0           \n\t"
        "psrld $4, %%xmm0               \n\t" /* pre-scale to avoid overflow */
        "paddd %%xmm0, %%xmm2           \n\t"
        "add $32, %0                    \n\t" /* 32 bytes = 16 int16 per pass */
        "cmp $128, %0                   \n\t" /* 128 bytes = 64 coefficients */ //FIXME optimize & bench
        " jb 1b                         \n\t"
        /* horizontal sum of the 4 dword partial sums in xmm2 */
        "pshufd $0x0E, %%xmm2, %%xmm0   \n\t"
        "paddd %%xmm0, %%xmm2           \n\t"
        "pshufd $0x01, %%xmm2, %%xmm0   \n\t"
        "paddd %%xmm0, %%xmm2           \n\t"
        "psrld $2, %%xmm2               \n\t"
        "movd %%xmm2, %0                \n\t" /* reuse the counter as result */
        : "+r" (i)
        : "r"(basis), "r"(rem), "r"(weight), "g"(scale)
          XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3")
    );
    return i;
}
84 | |||
/**
 * Add 'basis', scaled by 'scale', into the 8x8 residual block 'rem'
 * in place. SSSE3 version using pmulhrsw for the rounded scaling.
 *
 * @param rem   64 residual coefficients, updated in place
 * @param basis 64 basis-function coefficients
 * @param scale scale factor applied to the basis
 */
static void add_8x8basis_ssse3(int16_t rem[64], const int16_t basis[64], int scale)
{
    x86_reg i=0;

    /* The SIMD path pre-shifts scale into a 16-bit pmulhrsw multiplier;
     * for |scale| >= 1024 the shifted value would no longer fit, so fall
     * back to the exact scalar computation below. */
    if (FFABS(scale) < 1024) {
        scale <<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
        __asm__ volatile(
            "movd %3, %%xmm2                \n\t" /* broadcast scale to all */
            "punpcklwd %%xmm2, %%xmm2       \n\t" /* 8 word lanes of xmm2   */
            "pshufd $0, %%xmm2, %%xmm2      \n\t"
            ".p2align 4                     \n\t"
            "1:                             \n\t"
            /* load 16 basis coefficients (%1 = basis, %0 = byte offset) */
            "movdqa (%1, %0), %%xmm0        \n\t"
            "movdqa 16(%1, %0), %%xmm1      \n\t"
            /* scale with rounding, accumulate into rem (%2) and store back */
            "pmulhrsw %%xmm2, %%xmm0        \n\t"
            "pmulhrsw %%xmm2, %%xmm1        \n\t"
            "paddw (%2, %0), %%xmm0         \n\t"
            "paddw 16(%2, %0), %%xmm1       \n\t"
            "movdqa %%xmm0, (%2, %0)        \n\t"
            "movdqa %%xmm1, 16(%2, %0)      \n\t"
            "add $32, %0                    \n\t" /* 32 bytes = 16 int16 per pass */
            "cmp $128, %0                   \n\t" /* 128 bytes = 64 coefficients */ // FIXME optimize & bench
            " jb 1b                         \n\t"
            : "+r" (i)
            : "r"(basis), "r"(rem), "g"(scale)
              XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2")
        );
    } else {
        /* Scalar fallback: exact rounded fixed-point add, matching the
         * C reference implementation. */
        for (i=0; i<8*8; i++) {
            rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
        }
    }
}
118 | |||
119 | #endif /* HAVE_SSSE3_INLINE */ | ||
120 | |||
/* Draw the edges of width 'w' of an image of size width, height
 *
 * Extends the picture borders by replicating edge pixels (MMX, 8-bit
 * samples). Left/right borders are always extended; 'sides' selects
 * whether the top and/or bottom borders (including corners) are filled
 * via EDGE_TOP / EDGE_BOTTOM. 'w' is the horizontal edge width and must
 * be 4, 8 or 16; 'h' is the vertical edge height.
 */
static void draw_edges_mmx(uint8_t *buf, ptrdiff_t wrap, int width, int height,
                           int w, int h, int sides)
{
    uint8_t *ptr, *last_line;
    int i;

    /* left and right: for every row, smear the first pixel w bytes to the
     * left and the last pixel w bytes to the right of the row */
    ptr = buf;
    if (w == 8) {
        __asm__ volatile (
            "1:                             \n\t"
            /* broadcast row's first byte into mm0 and store 8 bytes left */
            "movd            (%0), %%mm0    \n\t"
            "punpcklbw      %%mm0, %%mm0    \n\t"
            "punpcklwd      %%mm0, %%mm0    \n\t"
            "punpckldq      %%mm0, %%mm0    \n\t"
            "movq           %%mm0, -8(%0)   \n\t"
            /* broadcast row's last byte into mm1 and store 8 bytes right */
            "movq      -8(%0, %2), %%mm1    \n\t"
            "punpckhbw      %%mm1, %%mm1    \n\t"
            "punpckhwd      %%mm1, %%mm1    \n\t"
            "punpckhdq      %%mm1, %%mm1    \n\t"
            "movq           %%mm1, (%0, %2) \n\t"
            "add               %1, %0       \n\t" /* next row */
            "cmp               %3, %0       \n\t"
            "jnz               1b           \n\t"
            : "+r" (ptr)
            : "r" ((x86_reg) wrap), "r" ((x86_reg) width),
              "r" (ptr + wrap * height));
    } else if (w == 16) {
        __asm__ volatile (
            "1:                                 \n\t"
            /* same as above, but two 8-byte stores on each side */
            "movd            (%0), %%mm0        \n\t"
            "punpcklbw      %%mm0, %%mm0        \n\t"
            "punpcklwd      %%mm0, %%mm0        \n\t"
            "punpckldq      %%mm0, %%mm0        \n\t"
            "movq           %%mm0, -8(%0)       \n\t"
            "movq           %%mm0, -16(%0)      \n\t"
            "movq      -8(%0, %2), %%mm1        \n\t"
            "punpckhbw      %%mm1, %%mm1        \n\t"
            "punpckhwd      %%mm1, %%mm1        \n\t"
            "punpckhdq      %%mm1, %%mm1        \n\t"
            "movq           %%mm1,  (%0, %2)    \n\t"
            "movq           %%mm1, 8(%0, %2)    \n\t"
            "add               %1, %0           \n\t"
            "cmp               %3, %0           \n\t"
            "jnz               1b               \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    } else {
        av_assert1(w == 4);
        __asm__ volatile (
            "1:                             \n\t"
            /* 4-byte variant: one movd store on each side */
            "movd            (%0), %%mm0    \n\t"
            "punpcklbw      %%mm0, %%mm0    \n\t"
            "punpcklwd      %%mm0, %%mm0    \n\t"
            "movd           %%mm0, -4(%0)   \n\t"
            "movd      -4(%0, %2), %%mm1    \n\t"
            "punpcklbw      %%mm1, %%mm1    \n\t"
            "punpckhwd      %%mm1, %%mm1    \n\t"
            "punpckhdq      %%mm1, %%mm1    \n\t"
            "movd           %%mm1, (%0, %2) \n\t"
            "add               %1, %0       \n\t"
            "cmp               %3, %0       \n\t"
            "jnz               1b           \n\t"
            : "+r" (ptr)
            : "r" ((x86_reg) wrap), "r" ((x86_reg) width),
              "r" (ptr + wrap * height));
    }

    /* top and bottom + corners: the left/right pass above already widened
     * the first and last lines, so copying width + 2*w bytes of them fills
     * the corners as well */
    buf -= w;
    last_line = buf + (height - 1) * wrap;
    if (sides & EDGE_TOP)
        for (i = 0; i < h; i++)
            // top
            memcpy(buf - (i + 1) * wrap, buf, width + w + w);
    if (sides & EDGE_BOTTOM)
        for (i = 0; i < h; i++)
            // bottom
            memcpy(last_line + (i + 1) * wrap, last_line, width + w + w);
}
203 | |||
204 | #endif /* HAVE_INLINE_ASM */ | ||
205 | |||
206 | 377 | av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c, | |
207 | AVCodecContext *avctx) | ||
208 | { | ||
209 | 377 | int cpu_flags = av_get_cpu_flags(); | |
210 | |||
211 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 367 times.
|
377 | if (EXTERNAL_SSE2(cpu_flags)) { |
212 | 10 | c->pix_sum = ff_pix_sum16_sse2; | |
213 | 10 | c->pix_norm1 = ff_pix_norm1_sse2; | |
214 | } | ||
215 | |||
216 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 377 times.
|
377 | if (EXTERNAL_XOP(cpu_flags)) { |
217 | ✗ | c->pix_sum = ff_pix_sum16_xop; | |
218 | } | ||
219 | |||
220 | #if HAVE_INLINE_ASM | ||
221 | |||
222 |
2/2✓ Branch 0 taken 13 times.
✓ Branch 1 taken 364 times.
|
377 | if (INLINE_MMX(cpu_flags)) { |
223 |
1/2✓ Branch 0 taken 13 times.
✗ Branch 1 not taken.
|
13 | if (avctx->bits_per_raw_sample <= 8) { |
224 | 13 | c->draw_edges = draw_edges_mmx; | |
225 | } | ||
226 | } | ||
227 | |||
228 | #if HAVE_SSSE3_INLINE | ||
229 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 369 times.
|
377 | if (INLINE_SSSE3(cpu_flags)) { |
230 |
2/2✓ Branch 0 taken 7 times.
✓ Branch 1 taken 1 times.
|
8 | if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) { |
231 | 7 | c->try_8x8basis = try_8x8basis_ssse3; | |
232 | } | ||
233 | 8 | c->add_8x8basis = add_8x8basis_ssse3; | |
234 | } | ||
235 | #endif /* HAVE_SSSE3_INLINE */ | ||
236 | |||
237 | #endif /* HAVE_INLINE_ASM */ | ||
238 | 377 | } | |
239 |