Line | Branch | Exec | Source |
---|---|---|---|
1 | /* | ||
2 | * Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru> | ||
3 | * H.263, MPEG-1, MPEG-2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at> | ||
4 | * | ||
5 | * This file is part of FFmpeg. | ||
6 | * | ||
7 | * FFmpeg is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU Lesser General Public | ||
9 | * License as published by the Free Software Foundation; either | ||
10 | * version 2.1 of the License, or (at your option) any later version. | ||
11 | * | ||
12 | * FFmpeg is distributed in the hope that it will be useful, | ||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
15 | * Lesser General Public License for more details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU Lesser General Public | ||
18 | * License along with FFmpeg; if not, write to the Free Software | ||
19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
20 | */ | ||
21 | |||
22 | #include "libavutil/attributes.h" | ||
23 | #include "libavutil/cpu.h" | ||
24 | #include "libavutil/x86/asm.h" | ||
25 | #include "libavutil/x86/cpu.h" | ||
26 | #include "libavcodec/avcodec.h" | ||
27 | #include "libavcodec/mpegvideo.h" | ||
28 | #include "libavcodec/mpegvideodata.h" | ||
29 | |||
30 | #if HAVE_MMX_INLINE | ||
31 | |||
/*
 * Dequantize one intra block in place with the H.263 rule, using MMX
 * inline assembly.
 *
 * s      - codec context; this function reads h263_aic, y_dc_scale,
 *          c_dc_scale, ac_pred, intra_scantable.raster_end and
 *          block_last_index[n].
 * block  - int16_t coefficients, rewritten in place.
 * n      - block index; n < 4 selects y_dc_scale, otherwise c_dc_scale.
 * qscale - quantizer scale; qmul is qscale << 1 and, unless h263_aic is
 *          set, qadd is (qscale - 1) OR 1.
 *
 * For each processed coefficient c (16-bit wrapping arithmetic):
 *   c > 0  gives  c * qmul + qadd
 *   c < 0  gives  c * qmul - qadd
 *   c == 0 stays 0
 * The DC value is computed in C before the loop and written to block[0]
 * after it, replacing whatever the loop stored there.
 */
32 | 194460 | static void dct_unquantize_h263_intra_mmx(MpegEncContext *s, | |
33 | int16_t *block, int n, int qscale) | ||
34 | { | ||
35 | x86_reg level, qmul, qadd, nCoeffs; | ||
36 | |||
37 | 194460 | qmul = qscale << 1; | |
38 | |||
39 | av_assert2(s->block_last_index[n]>=0 || s->h263_aic); | ||
40 | |||
/* Without h263_aic the DC coefficient is scaled by the per-plane DC scale;
 * with h263_aic it is kept unchanged and no rounding offset is applied. */
41 |
1/2✓ Branch 0 taken 194460 times.
✗ Branch 1 not taken.
|
194460 | if (!s->h263_aic) { |
42 |
2/2✓ Branch 0 taken 129640 times.
✓ Branch 1 taken 64820 times.
|
194460 | if (n < 4) |
43 | 129640 | level = block[0] * s->y_dc_scale; | |
44 | else | ||
45 | 64820 | level = block[0] * s->c_dc_scale; | |
46 | 194460 | qadd = (qscale - 1) | 1; | |
47 | }else{ | ||
48 | ✗ | qadd = 0; | |
49 | ✗ | level= block[0]; | |
50 | } | ||
/* With ac_pred the whole block (last index 63) is processed; otherwise
 * only up to the raster position of the last coded coefficient. */
51 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 194460 times.
|
194460 | if(s->ac_pred) |
52 | ✗ | nCoeffs=63; | |
53 | else | ||
54 | 194460 | nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; | |
55 | |||
/*
 * Operands: %0 = block + nCoeffs, %1 = qmul, %2 = qadd,
 * %3 = byte offset -2 * nCoeffs, so (%0, %3) initially addresses block[0].
 * Each iteration handles 8 coefficients (16 bytes); the loop repeats while
 * the advanced offset is still <= 0 (jng).
 *
 * NOTE(review): the add below writes to operand %3, which is declared as
 * an input-only operand. GCC documents that input operands must not be
 * modified; a read-write operand would express this - TODO confirm.
 */
56 | 194460 | __asm__ volatile( | |
/* Setup: mm6 = qmul in all four words, mm7 = -qadd in all four words,
 * mm4 = 0. */
57 | "movd %1, %%mm6 \n\t" //qmul | ||
58 | "packssdw %%mm6, %%mm6 \n\t" | ||
59 | "packssdw %%mm6, %%mm6 \n\t" | ||
60 | "movd %2, %%mm5 \n\t" //qadd | ||
61 | "pxor %%mm7, %%mm7 \n\t" | ||
62 | "packssdw %%mm5, %%mm5 \n\t" | ||
63 | "packssdw %%mm5, %%mm5 \n\t" | ||
64 | "psubw %%mm5, %%mm7 \n\t" | ||
65 | "pxor %%mm4, %%mm4 \n\t" | ||
66 | ".p2align 4 \n\t" | ||
67 | "1: \n\t" | ||
68 | "movq (%0, %3), %%mm0 \n\t" | ||
69 | "movq 8(%0, %3), %%mm1 \n\t" | ||
70 | |||
71 | "pmullw %%mm6, %%mm0 \n\t" | ||
72 | "pmullw %%mm6, %%mm1 \n\t" | ||
73 | |||
74 | "movq (%0, %3), %%mm2 \n\t" | ||
75 | "movq 8(%0, %3), %%mm3 \n\t" | ||
76 | |||
77 | "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] > 0 ? -1 : 0 | ||
78 | "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] > 0 ? -1 : 0 | ||
79 | |||
/* Complement the product for positive inputs, add -qadd, then complement
 * back: positives end up as product + qadd, the rest as product - qadd. */
80 | "pxor %%mm2, %%mm0 \n\t" | ||
81 | "pxor %%mm3, %%mm1 \n\t" | ||
82 | |||
83 | "paddw %%mm7, %%mm0 \n\t" | ||
84 | "paddw %%mm7, %%mm1 \n\t" | ||
85 | |||
86 | "pxor %%mm0, %%mm2 \n\t" | ||
87 | "pxor %%mm1, %%mm3 \n\t" | ||
88 | |||
/* An intermediate equal to -qadd marks a zero input; pandn clears those
 * lanes so zero coefficients stay zero. */
89 | "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 | ||
90 | "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 | ||
91 | |||
92 | "pandn %%mm2, %%mm0 \n\t" | ||
93 | "pandn %%mm3, %%mm1 \n\t" | ||
94 | |||
95 | "movq %%mm0, (%0, %3) \n\t" | ||
96 | "movq %%mm1, 8(%0, %3) \n\t" | ||
97 | |||
98 | "add $16, %3 \n\t" | ||
99 | "jng 1b \n\t" | ||
100 | 194460 | ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs)) | |
101 | : "memory" | ||
102 | ); | ||
/* Restore the separately computed DC value over the loop output. */
103 | 194460 | block[0]= level; | |
104 | 194460 | } | |
105 | |||
106 | |||
/*
 * Dequantize one inter block in place with the H.263 rule, using MMX
 * inline assembly.
 *
 * s      - codec context; this function reads inter_scantable.raster_end
 *          and block_last_index[n] (h263_aic only inside the assertion).
 * block  - int16_t coefficients, rewritten in place.
 * n      - block index, used to look up block_last_index.
 * qscale - quantizer scale; qmul is qscale << 1, qadd is (qscale - 1) OR 1.
 *
 * For each processed coefficient c (16-bit wrapping arithmetic):
 *   c > 0  gives  c * qmul + qadd
 *   c < 0  gives  c * qmul - qadd
 *   c == 0 stays 0
 * Unlike the intra variant there is no separate DC handling.
 */
107 | 49919 | static void dct_unquantize_h263_inter_mmx(MpegEncContext *s, | |
108 | int16_t *block, int n, int qscale) | ||
109 | { | ||
110 | x86_reg qmul, qadd, nCoeffs; | ||
111 | |||
112 | 49919 | qmul = qscale << 1; | |
113 | 49919 | qadd = (qscale - 1) | 1; | |
114 | |||
115 | av_assert2(s->block_last_index[n]>=0 || s->h263_aic); | ||
116 | |||
117 | 49919 | nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; | |
118 | |||
/*
 * Operands: %0 = block + nCoeffs, %1 = qmul, %2 = qadd,
 * %3 = byte offset -2 * nCoeffs, so (%0, %3) initially addresses block[0].
 * Each iteration handles 8 coefficients (16 bytes); the loop repeats while
 * the advanced offset is still <= 0 (jng).
 *
 * NOTE(review): the add below writes to operand %3, which is declared as
 * an input-only operand. GCC documents that input operands must not be
 * modified; a read-write operand would express this - TODO confirm.
 */
119 | 49919 | __asm__ volatile( | |
/* Setup: mm6 = qmul in all four words, mm7 = -qadd in all four words,
 * mm4 = 0. */
120 | "movd %1, %%mm6 \n\t" //qmul | ||
121 | "packssdw %%mm6, %%mm6 \n\t" | ||
122 | "packssdw %%mm6, %%mm6 \n\t" | ||
123 | "movd %2, %%mm5 \n\t" //qadd | ||
124 | "pxor %%mm7, %%mm7 \n\t" | ||
125 | "packssdw %%mm5, %%mm5 \n\t" | ||
126 | "packssdw %%mm5, %%mm5 \n\t" | ||
127 | "psubw %%mm5, %%mm7 \n\t" | ||
128 | "pxor %%mm4, %%mm4 \n\t" | ||
129 | ".p2align 4 \n\t" | ||
130 | "1: \n\t" | ||
131 | "movq (%0, %3), %%mm0 \n\t" | ||
132 | "movq 8(%0, %3), %%mm1 \n\t" | ||
133 | |||
134 | "pmullw %%mm6, %%mm0 \n\t" | ||
135 | "pmullw %%mm6, %%mm1 \n\t" | ||
136 | |||
137 | "movq (%0, %3), %%mm2 \n\t" | ||
138 | "movq 8(%0, %3), %%mm3 \n\t" | ||
139 | |||
140 | "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] > 0 ? -1 : 0 | ||
141 | "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] > 0 ? -1 : 0 | ||
142 | |||
/* Complement the product for positive inputs, add -qadd, then complement
 * back: positives end up as product + qadd, the rest as product - qadd. */
143 | "pxor %%mm2, %%mm0 \n\t" | ||
144 | "pxor %%mm3, %%mm1 \n\t" | ||
145 | |||
146 | "paddw %%mm7, %%mm0 \n\t" | ||
147 | "paddw %%mm7, %%mm1 \n\t" | ||
148 | |||
149 | "pxor %%mm0, %%mm2 \n\t" | ||
150 | "pxor %%mm1, %%mm3 \n\t" | ||
151 | |||
/* An intermediate equal to -qadd marks a zero input; pandn clears those
 * lanes so zero coefficients stay zero. */
152 | "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 | ||
153 | "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 | ||
154 | |||
155 | "pandn %%mm2, %%mm0 \n\t" | ||
156 | "pandn %%mm3, %%mm1 \n\t" | ||
157 | |||
158 | "movq %%mm0, (%0, %3) \n\t" | ||
159 | "movq %%mm1, 8(%0, %3) \n\t" | ||
160 | |||
161 | "add $16, %3 \n\t" | ||
162 | "jng 1b \n\t" | ||
163 | 49919 | ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs)) | |
164 | : "memory" | ||
165 | ); | ||
166 | 49919 | } | |
167 | |||
/*
 * Dequantize one intra block in place with the MPEG-1 rule, using MMX
 * inline assembly.
 *
 * s      - codec context; this function reads intra_scantable.raster_end,
 *          block_last_index[n], y_dc_scale, c_dc_scale and intra_matrix.
 * block  - int16_t coefficients, rewritten in place.
 * n      - block index; n < 4 selects y_dc_scale, otherwise c_dc_scale.
 * qscale - quantizer scale, multiplied with each quant matrix entry.
 *
 * For each processed nonzero coefficient c, in 16-bit arithmetic:
 *   v = (abs(c) * qscale * quant_matrix[i]) >> 3
 *   v = (v - 1) OR 1
 *   result = v with the sign of c
 * Zero coefficients stay zero. block[0] is overwritten afterwards with
 * the DC value computed in C.
 */
168 | ✗ | static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s, | |
169 | int16_t *block, int n, int qscale) | ||
170 | { | ||
171 | x86_reg nCoeffs; | ||
172 | const uint16_t *quant_matrix; | ||
173 | int block0; | ||
174 | |||
175 | av_assert2(s->block_last_index[n]>=0); | ||
176 | |||
/* One past the raster position of the last coded coefficient; the loop
 * below runs while its byte offset is negative (js). */
177 | ✗ | nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; | |
178 | |||
179 | ✗ | if (n < 4) | |
180 | ✗ | block0 = block[0] * s->y_dc_scale; | |
181 | else | ||
182 | ✗ | block0 = block[0] * s->c_dc_scale; | |
183 | /* XXX: only MPEG-1 */ | ||
184 | ✗ | quant_matrix = s->intra_matrix; | |
/*
 * Operands: %0 = block + nCoeffs, %1 = quant_matrix + nCoeffs,
 * %2 = qscale, %3 = byte offset -2 * nCoeffs (copied into the
 * accumulator register, which serves as the loop index).
 * Setup: mm7 = 1 in every word, mm6 = qscale in every word.
 * Each iteration handles 8 coefficients (16 bytes).
 */
185 | ✗ | __asm__ volatile( | |
186 | "pcmpeqw %%mm7, %%mm7 \n\t" | ||
187 | "psrlw $15, %%mm7 \n\t" | ||
188 | "movd %2, %%mm6 \n\t" | ||
189 | "packssdw %%mm6, %%mm6 \n\t" | ||
190 | "packssdw %%mm6, %%mm6 \n\t" | ||
191 | "mov %3, %%"FF_REG_a" \n\t" | ||
192 | ".p2align 4 \n\t" | ||
193 | "1: \n\t" | ||
194 | "movq (%0, %%"FF_REG_a"), %%mm0 \n\t" | ||
195 | "movq 8(%0, %%"FF_REG_a"), %%mm1\n\t" | ||
196 | "movq (%1, %%"FF_REG_a"), %%mm4 \n\t" | ||
197 | "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t" | ||
198 | "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | ||
199 | "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | ||
200 | "pxor %%mm2, %%mm2 \n\t" | ||
201 | "pxor %%mm3, %%mm3 \n\t" | ||
202 | "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | ||
203 | "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | ||
204 | "pxor %%mm2, %%mm0 \n\t" | ||
205 | "pxor %%mm3, %%mm1 \n\t" | ||
206 | "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) | ||
207 | "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) | ||
208 | "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q | ||
209 | "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q | ||
210 | "pxor %%mm4, %%mm4 \n\t" | ||
211 | "pxor %%mm5, %%mm5 \n\t" // FIXME slow | ||
212 | "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | ||
213 | "pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 | ||
214 | "psraw $3, %%mm0 \n\t" | ||
215 | "psraw $3, %%mm1 \n\t" | ||
/* (v - 1) OR 1: subtract one, then set the lowest bit. */
216 | "psubw %%mm7, %%mm0 \n\t" | ||
217 | "psubw %%mm7, %%mm1 \n\t" | ||
218 | "por %%mm7, %%mm0 \n\t" | ||
219 | "por %%mm7, %%mm1 \n\t" | ||
/* Reapply the original sign, then clear the lanes whose input was zero. */
220 | "pxor %%mm2, %%mm0 \n\t" | ||
221 | "pxor %%mm3, %%mm1 \n\t" | ||
222 | "psubw %%mm2, %%mm0 \n\t" | ||
223 | "psubw %%mm3, %%mm1 \n\t" | ||
224 | "pandn %%mm0, %%mm4 \n\t" | ||
225 | "pandn %%mm1, %%mm5 \n\t" | ||
226 | "movq %%mm4, (%0, %%"FF_REG_a") \n\t" | ||
227 | "movq %%mm5, 8(%0, %%"FF_REG_a")\n\t" | ||
228 | |||
229 | "add $16, %%"FF_REG_a" \n\t" | ||
230 | "js 1b \n\t" | ||
231 | ✗ | ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) | |
232 | : "%"FF_REG_a, "memory" | ||
233 | ); | ||
/* Restore the separately computed DC value over the loop output. */
234 | ✗ | block[0]= block0; | |
235 | ✗ | } | |
236 | |||
/*
 * Dequantize one inter block in place with the MPEG-1 rule, using MMX
 * inline assembly.
 *
 * s      - codec context; this function reads intra_scantable.raster_end,
 *          block_last_index[n] and inter_matrix.
 * block  - int16_t coefficients, rewritten in place.
 * n      - block index, used to look up block_last_index.
 * qscale - quantizer scale, multiplied with each quant matrix entry.
 *
 * For each processed nonzero coefficient c, in 16-bit arithmetic:
 *   v = ((2 * abs(c) + 1) * qscale * quant_matrix[i]) >> 4
 *   v = (v - 1) OR 1
 *   result = v with the sign of c
 * Zero coefficients stay zero.
 */
237 | ✗ | static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s, | |
238 | int16_t *block, int n, int qscale) | ||
239 | { | ||
240 | x86_reg nCoeffs; | ||
241 | const uint16_t *quant_matrix; | ||
242 | |||
243 | av_assert2(s->block_last_index[n]>=0); | ||
244 | |||
/* NOTE(review): this inter routine takes its end position from
 * intra_scantable, not inter_scantable - presumably intentional;
 * TODO confirm against the C reference implementation. */
245 | ✗ | nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; | |
246 | |||
247 | ✗ | quant_matrix = s->inter_matrix; | |
/*
 * Operands: %0 = block + nCoeffs, %1 = quant_matrix + nCoeffs,
 * %2 = qscale, %3 = byte offset -2 * nCoeffs (copied into the
 * accumulator register, which serves as the loop index).
 * Setup: mm7 = 1 in every word, mm6 = qscale in every word.
 * Each iteration handles 8 coefficients (16 bytes); the loop runs while
 * the advanced offset is negative (js).
 */
248 | ✗ | __asm__ volatile( | |
249 | "pcmpeqw %%mm7, %%mm7 \n\t" | ||
250 | "psrlw $15, %%mm7 \n\t" | ||
251 | "movd %2, %%mm6 \n\t" | ||
252 | "packssdw %%mm6, %%mm6 \n\t" | ||
253 | "packssdw %%mm6, %%mm6 \n\t" | ||
254 | "mov %3, %%"FF_REG_a" \n\t" | ||
255 | ".p2align 4 \n\t" | ||
256 | "1: \n\t" | ||
257 | "movq (%0, %%"FF_REG_a"), %%mm0 \n\t" | ||
258 | "movq 8(%0, %%"FF_REG_a"), %%mm1\n\t" | ||
259 | "movq (%1, %%"FF_REG_a"), %%mm4 \n\t" | ||
260 | "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t" | ||
261 | "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | ||
262 | "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | ||
263 | "pxor %%mm2, %%mm2 \n\t" | ||
264 | "pxor %%mm3, %%mm3 \n\t" | ||
265 | "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | ||
266 | "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | ||
267 | "pxor %%mm2, %%mm0 \n\t" | ||
268 | "pxor %%mm3, %%mm1 \n\t" | ||
269 | "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) | ||
270 | "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) | ||
271 | "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 | ||
272 | "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 | ||
273 | "paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1 | ||
274 | "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1 | ||
275 | "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q | ||
276 | "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q | ||
277 | "pxor %%mm4, %%mm4 \n\t" | ||
278 | "pxor %%mm5, %%mm5 \n\t" // FIXME slow | ||
279 | "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | ||
280 | "pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 | ||
281 | "psraw $4, %%mm0 \n\t" | ||
282 | "psraw $4, %%mm1 \n\t" | ||
/* (v - 1) OR 1: subtract one, then set the lowest bit. */
283 | "psubw %%mm7, %%mm0 \n\t" | ||
284 | "psubw %%mm7, %%mm1 \n\t" | ||
285 | "por %%mm7, %%mm0 \n\t" | ||
286 | "por %%mm7, %%mm1 \n\t" | ||
/* Reapply the original sign, then clear the lanes whose input was zero. */
287 | "pxor %%mm2, %%mm0 \n\t" | ||
288 | "pxor %%mm3, %%mm1 \n\t" | ||
289 | "psubw %%mm2, %%mm0 \n\t" | ||
290 | "psubw %%mm3, %%mm1 \n\t" | ||
291 | "pandn %%mm0, %%mm4 \n\t" | ||
292 | "pandn %%mm1, %%mm5 \n\t" | ||
293 | "movq %%mm4, (%0, %%"FF_REG_a") \n\t" | ||
294 | "movq %%mm5, 8(%0, %%"FF_REG_a")\n\t" | ||
295 | |||
296 | "add $16, %%"FF_REG_a" \n\t" | ||
297 | "js 1b \n\t" | ||
298 | ✗ | ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) | |
299 | : "%"FF_REG_a, "memory" | ||
300 | ); | ||
301 | ✗ | } | |
302 | |||
/*
 * Dequantize one intra block in place with the MPEG-2 rule, using MMX
 * inline assembly.
 *
 * s      - codec context; this function reads q_scale_type,
 *          intra_scantable.raster_end, block_last_index[n], y_dc_scale,
 *          c_dc_scale and intra_matrix.
 * block  - int16_t coefficients, rewritten in place.
 * n      - block index; n < 4 selects y_dc_scale, otherwise c_dc_scale.
 * qscale - quantizer scale; mapped through ff_mpeg2_non_linear_qscale when
 *          q_scale_type is set, otherwise doubled.
 *
 * For each processed nonzero coefficient c, in 16-bit arithmetic:
 *   v = (abs(c) * qscale * quant_matrix[i]) >> 4   (qscale after mapping)
 *   result = v with the sign of c
 * Zero coefficients stay zero. block[0] is overwritten afterwards with
 * the DC value computed in C.
 */
303 | ✗ | static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s, | |
304 | int16_t *block, int n, int qscale) | ||
305 | { | ||
306 | x86_reg nCoeffs; | ||
307 | const uint16_t *quant_matrix; | ||
308 | int block0; | ||
309 | |||
310 | av_assert2(s->block_last_index[n]>=0); | ||
311 | |||
312 | ✗ | if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale]; | |
313 | ✗ | else qscale <<= 1; | |
314 | |||
/* Raster position of the last coded coefficient (no +1 here); the loop
 * below therefore runs while its byte offset is <= 0 (jng). */
315 | ✗ | nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; | |
316 | |||
317 | ✗ | if (n < 4) | |
318 | ✗ | block0 = block[0] * s->y_dc_scale; | |
319 | else | ||
320 | ✗ | block0 = block[0] * s->c_dc_scale; | |
321 | ✗ | quant_matrix = s->intra_matrix; | |
/*
 * Operands: %0 = block + nCoeffs, %1 = quant_matrix + nCoeffs,
 * %2 = qscale, %3 = byte offset -2 * nCoeffs (copied into the
 * accumulator register, which serves as the loop index).
 * Setup: mm7 = 1 in every word (not used by the loop body),
 * mm6 = qscale in every word.
 * Each iteration handles 8 coefficients (16 bytes).
 */
322 | ✗ | __asm__ volatile( | |
323 | "pcmpeqw %%mm7, %%mm7 \n\t" | ||
324 | "psrlw $15, %%mm7 \n\t" | ||
325 | "movd %2, %%mm6 \n\t" | ||
326 | "packssdw %%mm6, %%mm6 \n\t" | ||
327 | "packssdw %%mm6, %%mm6 \n\t" | ||
328 | "mov %3, %%"FF_REG_a" \n\t" | ||
329 | ".p2align 4 \n\t" | ||
330 | "1: \n\t" | ||
331 | "movq (%0, %%"FF_REG_a"), %%mm0 \n\t" | ||
332 | "movq 8(%0, %%"FF_REG_a"), %%mm1\n\t" | ||
333 | "movq (%1, %%"FF_REG_a"), %%mm4 \n\t" | ||
334 | "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t" | ||
335 | "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | ||
336 | "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | ||
337 | "pxor %%mm2, %%mm2 \n\t" | ||
338 | "pxor %%mm3, %%mm3 \n\t" | ||
339 | "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | ||
340 | "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | ||
341 | "pxor %%mm2, %%mm0 \n\t" | ||
342 | "pxor %%mm3, %%mm1 \n\t" | ||
343 | "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) | ||
344 | "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) | ||
345 | "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q | ||
346 | "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q | ||
347 | "pxor %%mm4, %%mm4 \n\t" | ||
348 | "pxor %%mm5, %%mm5 \n\t" // FIXME slow | ||
349 | "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | ||
350 | "pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 | ||
351 | "psraw $4, %%mm0 \n\t" | ||
352 | "psraw $4, %%mm1 \n\t" | ||
/* Reapply the original sign, then clear the lanes whose input was zero. */
353 | "pxor %%mm2, %%mm0 \n\t" | ||
354 | "pxor %%mm3, %%mm1 \n\t" | ||
355 | "psubw %%mm2, %%mm0 \n\t" | ||
356 | "psubw %%mm3, %%mm1 \n\t" | ||
357 | "pandn %%mm0, %%mm4 \n\t" | ||
358 | "pandn %%mm1, %%mm5 \n\t" | ||
359 | "movq %%mm4, (%0, %%"FF_REG_a") \n\t" | ||
360 | "movq %%mm5, 8(%0, %%"FF_REG_a")\n\t" | ||
361 | |||
362 | "add $16, %%"FF_REG_a" \n\t" | ||
363 | "jng 1b \n\t" | ||
364 | ✗ | ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) | |
365 | : "%"FF_REG_a, "memory" | ||
366 | ); | ||
/* Restore the separately computed DC value over the loop output. */
367 | ✗ | block[0]= block0; | |
368 | //Note, we do not do mismatch control for intra as errors cannot accumulate | ||
369 | ✗ | } | |
370 | |||
/*
 * Dequantize one inter block in place with the MPEG-2 rule, using MMX
 * inline assembly, and fold a parity bit into the last coefficient.
 *
 * s      - codec context; this function reads q_scale_type,
 *          intra_scantable.raster_end, block_last_index[n] and
 *          inter_matrix.
 * block  - int16_t coefficients, rewritten in place; the 32-bit word at
 *          byte offset 124 (block[62] and block[63]) is always read and
 *          written back after the loop.
 * n      - block index, used to look up block_last_index.
 * qscale - quantizer scale; mapped through ff_mpeg2_non_linear_qscale when
 *          q_scale_type is set, otherwise doubled.
 *
 * For each processed nonzero coefficient c, in 16-bit arithmetic:
 *   v = ((2 * abs(c) + 1) * qscale * quant_matrix[i]) >> 5   (logical shift)
 *   result = v with the sign of c
 * Zero coefficients stay zero. All stored results are XORed into mm7,
 * which starts with its low word set to all ones; after the loop the four
 * words of mm7 are XOR-folded and bit 0 of that fold is XORed into bit 0
 * of block[63]. This looks like MPEG-2 mismatch control - TODO confirm.
 */
371 | 5941 | static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s, | |
372 | int16_t *block, int n, int qscale) | ||
373 | { | ||
374 | x86_reg nCoeffs; | ||
375 | const uint16_t *quant_matrix; | ||
376 | |||
377 | av_assert2(s->block_last_index[n]>=0); | ||
378 | |||
379 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 5941 times.
|
5941 | if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale]; |
380 | 5941 | else qscale <<= 1; | |
381 | |||
/* NOTE(review): this inter routine takes its end position from
 * intra_scantable, not inter_scantable - presumably intentional;
 * TODO confirm against the C reference implementation. */
382 | 5941 | nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; | |
383 | |||
384 | 5941 | quant_matrix = s->inter_matrix; | |
/*
 * Operands: %0 = block + nCoeffs, %1 = quant_matrix + nCoeffs,
 * %2 = qscale, %3 = byte offset -2 * nCoeffs. %3 is copied into the
 * accumulator register for the loop and stays intact, because the code
 * after the loop uses 124(%0, %3) to address byte 124 of block.
 * Setup: mm7 = all ones in the low word only, mm6 = qscale in every word.
 * Each iteration handles 8 coefficients (16 bytes); the loop repeats while
 * the advanced offset is still <= 0 (jng).
 */
385 | 5941 | __asm__ volatile( | |
386 | "pcmpeqw %%mm7, %%mm7 \n\t" | ||
387 | "psrlq $48, %%mm7 \n\t" | ||
388 | "movd %2, %%mm6 \n\t" | ||
389 | "packssdw %%mm6, %%mm6 \n\t" | ||
390 | "packssdw %%mm6, %%mm6 \n\t" | ||
391 | "mov %3, %%"FF_REG_a" \n\t" | ||
392 | ".p2align 4 \n\t" | ||
393 | "1: \n\t" | ||
394 | "movq (%0, %%"FF_REG_a"), %%mm0 \n\t" | ||
395 | "movq 8(%0, %%"FF_REG_a"), %%mm1\n\t" | ||
396 | "movq (%1, %%"FF_REG_a"), %%mm4 \n\t" | ||
397 | "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t" | ||
398 | "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | ||
399 | "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | ||
400 | "pxor %%mm2, %%mm2 \n\t" | ||
401 | "pxor %%mm3, %%mm3 \n\t" | ||
402 | "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | ||
403 | "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | ||
404 | "pxor %%mm2, %%mm0 \n\t" | ||
405 | "pxor %%mm3, %%mm1 \n\t" | ||
406 | "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) | ||
407 | "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) | ||
408 | "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 | ||
409 | "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 | ||
410 | "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q | ||
411 | "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q | ||
412 | "paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q | ||
413 | "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q | ||
414 | "pxor %%mm4, %%mm4 \n\t" | ||
415 | "pxor %%mm5, %%mm5 \n\t" // FIXME slow | ||
416 | "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | ||
417 | "pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 | ||
418 | "psrlw $5, %%mm0 \n\t" | ||
419 | "psrlw $5, %%mm1 \n\t" | ||
/* Reapply the original sign, then clear the lanes whose input was zero. */
420 | "pxor %%mm2, %%mm0 \n\t" | ||
421 | "pxor %%mm3, %%mm1 \n\t" | ||
422 | "psubw %%mm2, %%mm0 \n\t" | ||
423 | "psubw %%mm3, %%mm1 \n\t" | ||
424 | "pandn %%mm0, %%mm4 \n\t" | ||
425 | "pandn %%mm1, %%mm5 \n\t" | ||
/* Accumulate the XOR of every stored result in mm7. */
426 | "pxor %%mm4, %%mm7 \n\t" | ||
427 | "pxor %%mm5, %%mm7 \n\t" | ||
428 | "movq %%mm4, (%0, %%"FF_REG_a") \n\t" | ||
429 | "movq %%mm5, 8(%0, %%"FF_REG_a")\n\t" | ||
430 | |||
431 | "add $16, %%"FF_REG_a" \n\t" | ||
432 | "jng 1b \n\t" | ||
/* Fold the four words of mm7 into its low word, isolate bit 0, move it to
 * bit 16 of the low dword and XOR it into the dword holding block[62] and
 * block[63], which flips bit 0 of block[63] when that parity bit is set. */
433 | "movd 124(%0, %3), %%mm0 \n\t" | ||
434 | "movq %%mm7, %%mm6 \n\t" | ||
435 | "psrlq $32, %%mm7 \n\t" | ||
436 | "pxor %%mm6, %%mm7 \n\t" | ||
437 | "movq %%mm7, %%mm6 \n\t" | ||
438 | "psrlq $16, %%mm7 \n\t" | ||
439 | "pxor %%mm6, %%mm7 \n\t" | ||
440 | "pslld $31, %%mm7 \n\t" | ||
441 | "psrlq $15, %%mm7 \n\t" | ||
442 | "pxor %%mm7, %%mm0 \n\t" | ||
443 | "movd %%mm0, 124(%0, %3) \n\t" | ||
444 | |||
445 | 5941 | ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs) | |
446 | : "%"FF_REG_a, "memory" | ||
447 | ); | ||
448 | 5941 | } | |
449 | |||
450 | #endif /* HAVE_MMX_INLINE */ | ||
451 | |||
/*
 * Install the MMX inline-asm dequantizers from this file into the
 * dct_unquantize_* function pointers of the given context.
 *
 * s - codec context whose function pointers are updated; s->avctx->flags
 *     is read to check AV_CODEC_FLAG_BITEXACT.
 *
 * Nothing is changed unless HAVE_MMX_INLINE is enabled at build time and
 * INLINE_MMX() accepts the flags returned by av_get_cpu_flags().
 */
452 | 918 | av_cold void ff_mpv_common_init_x86(MpegEncContext *s) | |
453 | { | ||
454 | #if HAVE_MMX_INLINE | ||
455 | 918 | int cpu_flags = av_get_cpu_flags(); | |
456 | |||
457 |
2/2✓ Branch 0 taken 163 times.
✓ Branch 1 taken 755 times.
|
918 | if (INLINE_MMX(cpu_flags)) { |
458 | 163 | s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx; | |
459 | 163 | s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx; | |
460 | 163 | s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx; | |
461 | 163 | s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx; | |
/* The MPEG-2 intra routine is left at its previous value when bit-exact
 * output is requested - presumably because the MMX version does not match
 * the reference output bit for bit; TODO confirm. */
462 |
2/2✓ Branch 0 taken 158 times.
✓ Branch 1 taken 5 times.
|
163 | if (!(s->avctx->flags & AV_CODEC_FLAG_BITEXACT)) |
463 | 158 | s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx; | |
464 | 163 | s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx; | |
465 | } | ||
466 | #endif /* HAVE_MMX_INLINE */ | ||
467 | 918 | } | |
468 |