Line | Branch | Exec | Source |
---|---|---|---|
1 | /* | ||
2 | * Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru> | ||
3 | * H.263, MPEG-1, MPEG-2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at> | ||
4 | * | ||
5 | * This file is part of FFmpeg. | ||
6 | * | ||
7 | * FFmpeg is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU Lesser General Public | ||
9 | * License as published by the Free Software Foundation; either | ||
10 | * version 2.1 of the License, or (at your option) any later version. | ||
11 | * | ||
12 | * FFmpeg is distributed in the hope that it will be useful, | ||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
15 | * Lesser General Public License for more details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU Lesser General Public | ||
18 | * License along with FFmpeg; if not, write to the Free Software | ||
19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
20 | */ | ||
21 | |||
22 | #include "libavutil/attributes.h" | ||
23 | #include "libavutil/cpu.h" | ||
24 | #include "libavutil/x86/asm.h" | ||
25 | #include "libavutil/x86/cpu.h" | ||
26 | #include "libavcodec/avcodec.h" | ||
27 | #include "libavcodec/mpegvideo.h" | ||
28 | #include "libavcodec/mpegvideodata.h" | ||
29 | #include "libavcodec/mpegvideo_unquantize.h" | ||
30 | |||
31 | #if HAVE_MMX_INLINE | ||
32 | |||
/**
 * H.263-style intra block dequantizer implemented with MMX inline assembly.
 *
 * With qmul = 2 * qscale, every processed coefficient c is rewritten in place
 * as c * qmul + qadd when c > 0, c * qmul - qadd when c < 0, and 0 when
 * c == 0. The arithmetic is done in packed 16-bit lanes (pmullw keeps only
 * the low 16 bits of each product).
 *
 * The DC coefficient is handled in C: its new value is computed before the
 * asm runs and stored into block[0] afterwards, overwriting whatever the
 * vector loop produced for that position.
 *
 * @param s      context; this function reads h263_aic, ac_pred, y_dc_scale,
 *               c_dc_scale, block_last_index[] and intra_scantable.raster_end[]
 * @param block  coefficient array, modified in place
 * @param n      block index; n < 4 selects y_dc_scale, otherwise c_dc_scale
 * @param qscale quantiser scale
 *
 * NOTE(review): asm operand %3 is declared as an input but is incremented
 * inside the asm, and the MMX registers used are not in the clobber list
 * (only "memory" is) -- confirm this is acceptable for the supported compilers.
 */
33 | 194460 | static void dct_unquantize_h263_intra_mmx(MpegEncContext *s, | |
34 | int16_t *block, int n, int qscale) | ||
35 | { | ||
36 | x86_reg level, qmul, qadd, nCoeffs; | ||
37 | |||
38 | 194460 | qmul = qscale << 1; | |
39 | |||
40 | av_assert2(s->block_last_index[n]>=0 || s->h263_aic); | ||
41 | |||
/* Without h263_aic the DC value is scaled by the luma/chroma DC scale and
 * qadd is (qscale - 1) | 1; with h263_aic the DC value is kept and qadd is 0. */
42 |
1/2✓ Branch 0 taken 194460 times.
✗ Branch 1 not taken.
|
194460 | if (!s->h263_aic) { |
43 |
2/2✓ Branch 0 taken 129640 times.
✓ Branch 1 taken 64820 times.
|
194460 | if (n < 4) |
44 | 129640 | level = block[0] * s->y_dc_scale; | |
45 | else | ||
46 | 64820 | level = block[0] * s->c_dc_scale; | |
47 | 194460 | qadd = (qscale - 1) | 1; | |
48 | }else{ | ||
49 | ✗ | qadd = 0; | |
50 | ✗ | level= block[0]; | |
51 | } | ||
/* With ac_pred the whole block (index 63) is processed; otherwise the end
 * position comes from the intra scan table entry for the last index. */
52 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 194460 times.
|
194460 | if(s->ac_pred) |
53 | ✗ | nCoeffs=63; | |
54 | else | ||
55 | 194460 | nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; | |
56 | |||
/* Operands: %0 = block + nCoeffs, %1 = qmul, %2 = qadd,
 * %3 = -2 * nCoeffs (negative byte offset, so (%0, %3) starts at block[0]).
 * Setup: mm6 = qmul in all four words, mm7 = -qadd in all four words, mm4 = 0. */
57 | 194460 | __asm__ volatile( | |
58 | "movd %1, %%mm6 \n\t" //qmul | ||
59 | "packssdw %%mm6, %%mm6 \n\t" | ||
60 | "packssdw %%mm6, %%mm6 \n\t" | ||
61 | "movd %2, %%mm5 \n\t" //qadd | ||
62 | "pxor %%mm7, %%mm7 \n\t" | ||
63 | "packssdw %%mm5, %%mm5 \n\t" | ||
64 | "packssdw %%mm5, %%mm5 \n\t" | ||
65 | "psubw %%mm5, %%mm7 \n\t" | ||
66 | "pxor %%mm4, %%mm4 \n\t" | ||
67 | ".p2align 4 \n\t" | ||
68 | "1: \n\t" | ||
// Each iteration handles 8 coefficients (two 4-word MMX registers).
69 | "movq (%0, %3), %%mm0 \n\t" | ||
70 | "movq 8(%0, %3), %%mm1 \n\t" | ||
71 | |||
72 | "pmullw %%mm6, %%mm0 \n\t" | ||
73 | "pmullw %%mm6, %%mm1 \n\t" | ||
74 | |||
75 | "movq (%0, %3), %%mm2 \n\t" | ||
76 | "movq 8(%0, %3), %%mm3 \n\t" | ||
77 | |||
78 | "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] > 0 ? -1 : 0 | ||
79 | "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] > 0 ? -1 : 0 | ||
80 | |||
// Complement positive lanes, add -qadd, complement them back:
// positive lanes end up as c*qmul + qadd, the others as c*qmul - qadd.
81 | "pxor %%mm2, %%mm0 \n\t" | ||
82 | "pxor %%mm3, %%mm1 \n\t" | ||
83 | |||
84 | "paddw %%mm7, %%mm0 \n\t" | ||
85 | "paddw %%mm7, %%mm1 \n\t" | ||
86 | |||
87 | "pxor %%mm0, %%mm2 \n\t" | ||
88 | "pxor %%mm1, %%mm3 \n\t" | ||
89 | |||
// A zero input leaves exactly -qadd in mm0/mm1, which is how zeros are detected.
90 | "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 | ||
91 | "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 | ||
92 | |||
93 | "pandn %%mm2, %%mm0 \n\t" | ||
94 | "pandn %%mm3, %%mm1 \n\t" | ||
95 | |||
96 | "movq %%mm0, (%0, %3) \n\t" | ||
97 | "movq %%mm1, 8(%0, %3) \n\t" | ||
98 | |||
// Advance by 16 bytes; keep looping while the offset is still <= 0.
99 | "add $16, %3 \n\t" | ||
100 | "jng 1b \n\t" | ||
101 | 194460 | ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs)) | |
102 | : "memory" | ||
103 | ); | ||
/* Store the separately computed DC value over the loop's result for block[0]. */
104 | 194460 | block[0]= level; | |
105 | 194460 | } | |
106 | |||
107 | |||
/**
 * H.263-style inter block dequantizer implemented with MMX inline assembly.
 *
 * With qmul = 2 * qscale and qadd = (qscale - 1) | 1, every processed
 * coefficient c is rewritten in place as c * qmul + qadd when c > 0,
 * c * qmul - qadd when c < 0, and 0 when c == 0, using packed 16-bit lanes.
 * Unlike the intra variant there is no separate DC handling.
 *
 * @param s      context; this function reads block_last_index[] and
 *               inter_scantable.raster_end[] (h263_aic only in the assertion)
 * @param block  coefficient array, modified in place
 * @param n      block index, used to look up block_last_index[n]
 * @param qscale quantiser scale
 *
 * NOTE(review): asm operand %3 is declared as an input but is incremented
 * inside the asm, and the MMX registers used are not in the clobber list
 * (only "memory" is) -- confirm this is acceptable for the supported compilers.
 */
108 | 49919 | static void dct_unquantize_h263_inter_mmx(MpegEncContext *s, | |
109 | int16_t *block, int n, int qscale) | ||
110 | { | ||
111 | x86_reg qmul, qadd, nCoeffs; | ||
112 | |||
113 | 49919 | qmul = qscale << 1; | |
114 | 49919 | qadd = (qscale - 1) | 1; | |
115 | |||
116 | av_assert2(s->block_last_index[n]>=0 || s->h263_aic); | ||
117 | |||
118 | 49919 | nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; | |
119 | |||
/* Operands: %0 = block + nCoeffs, %1 = qmul, %2 = qadd,
 * %3 = -2 * nCoeffs (negative byte offset, so (%0, %3) starts at block[0]).
 * Setup: mm6 = qmul in all four words, mm7 = -qadd in all four words, mm4 = 0. */
120 | 49919 | __asm__ volatile( | |
121 | "movd %1, %%mm6 \n\t" //qmul | ||
122 | "packssdw %%mm6, %%mm6 \n\t" | ||
123 | "packssdw %%mm6, %%mm6 \n\t" | ||
124 | "movd %2, %%mm5 \n\t" //qadd | ||
125 | "pxor %%mm7, %%mm7 \n\t" | ||
126 | "packssdw %%mm5, %%mm5 \n\t" | ||
127 | "packssdw %%mm5, %%mm5 \n\t" | ||
128 | "psubw %%mm5, %%mm7 \n\t" | ||
129 | "pxor %%mm4, %%mm4 \n\t" | ||
130 | ".p2align 4 \n\t" | ||
131 | "1: \n\t" | ||
// Each iteration handles 8 coefficients (two 4-word MMX registers).
132 | "movq (%0, %3), %%mm0 \n\t" | ||
133 | "movq 8(%0, %3), %%mm1 \n\t" | ||
134 | |||
135 | "pmullw %%mm6, %%mm0 \n\t" | ||
136 | "pmullw %%mm6, %%mm1 \n\t" | ||
137 | |||
138 | "movq (%0, %3), %%mm2 \n\t" | ||
139 | "movq 8(%0, %3), %%mm3 \n\t" | ||
140 | |||
141 | "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] > 0 ? -1 : 0 | ||
142 | "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] > 0 ? -1 : 0 | ||
143 | |||
// Complement positive lanes, add -qadd, complement them back:
// positive lanes end up as c*qmul + qadd, the others as c*qmul - qadd.
144 | "pxor %%mm2, %%mm0 \n\t" | ||
145 | "pxor %%mm3, %%mm1 \n\t" | ||
146 | |||
147 | "paddw %%mm7, %%mm0 \n\t" | ||
148 | "paddw %%mm7, %%mm1 \n\t" | ||
149 | |||
150 | "pxor %%mm0, %%mm2 \n\t" | ||
151 | "pxor %%mm1, %%mm3 \n\t" | ||
152 | |||
// A zero input leaves exactly -qadd in mm0/mm1, which is how zeros are detected.
153 | "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 | ||
154 | "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 | ||
155 | |||
156 | "pandn %%mm2, %%mm0 \n\t" | ||
157 | "pandn %%mm3, %%mm1 \n\t" | ||
158 | |||
159 | "movq %%mm0, (%0, %3) \n\t" | ||
160 | "movq %%mm1, 8(%0, %3) \n\t" | ||
161 | |||
// Advance by 16 bytes; keep looping while the offset is still <= 0.
162 | "add $16, %3 \n\t" | ||
163 | "jng 1b \n\t" | ||
164 | 49919 | ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs)) | |
165 | : "memory" | ||
166 | ); | ||
167 | 49919 | } | |
168 | |||
/**
 * MPEG-1 intra block dequantizer implemented with MMX inline assembly.
 *
 * For every processed coefficient c with q = qscale * quant_matrix[i]
 * (quant_matrix = s->intra_matrix), the stored result is
 *     sign(c) * ((((|c| * q) >> 3) - 1) | 1)
 * computed in packed 16-bit lanes; a zero coefficient stays zero.
 * The DC coefficient is computed in C before the asm and written to block[0]
 * afterwards, replacing the vector result for that position.
 *
 * @param s      context; this function reads block_last_index[],
 *               intra_scantable.raster_end[], y_dc_scale, c_dc_scale and
 *               intra_matrix
 * @param block  coefficient array, modified in place
 * @param n      block index; n < 4 selects y_dc_scale, otherwise c_dc_scale
 * @param qscale quantiser scale
 *
 * NOTE(review): the MMX registers used are not in the clobber list -- confirm
 * this is acceptable for the supported compilers.
 */
169 | ✗ | static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s, | |
170 | int16_t *block, int n, int qscale) | ||
171 | { | ||
172 | x86_reg nCoeffs; | ||
173 | const uint16_t *quant_matrix; | ||
174 | int block0; | ||
175 | |||
176 | av_assert2(s->block_last_index[n]>=0); | ||
177 | |||
/* One past the scan-table end position: the loop below stops as soon as the
 * offset is no longer negative. */
178 | ✗ | nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; | |
179 | |||
180 | ✗ | if (n < 4) | |
181 | ✗ | block0 = block[0] * s->y_dc_scale; | |
182 | else | ||
183 | ✗ | block0 = block[0] * s->c_dc_scale; | |
184 | /* XXX: only MPEG-1 */ | ||
185 | ✗ | quant_matrix = s->intra_matrix; | |
/* Operands: %0 = block + nCoeffs, %1 = quant_matrix + nCoeffs, %2 = qscale,
 * %3 = -2 * nCoeffs, copied into FF_REG_a as the running byte offset.
 * Setup: mm7 = 1 in all four words, mm6 = qscale in all four words. */
186 | ✗ | __asm__ volatile( | |
187 | "pcmpeqw %%mm7, %%mm7 \n\t" | ||
188 | "psrlw $15, %%mm7 \n\t" | ||
189 | "movd %2, %%mm6 \n\t" | ||
190 | "packssdw %%mm6, %%mm6 \n\t" | ||
191 | "packssdw %%mm6, %%mm6 \n\t" | ||
192 | "mov %3, %%"FF_REG_a" \n\t" | ||
193 | ".p2align 4 \n\t" | ||
194 | "1: \n\t" | ||
195 | "movq (%0, %%"FF_REG_a"), %%mm0 \n\t" | ||
196 | "movq 8(%0, %%"FF_REG_a"), %%mm1\n\t" | ||
197 | "movq (%1, %%"FF_REG_a"), %%mm4 \n\t" | ||
198 | "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t" | ||
199 | "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | ||
200 | "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | ||
201 | "pxor %%mm2, %%mm2 \n\t" | ||
202 | "pxor %%mm3, %%mm3 \n\t" | ||
203 | "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | ||
204 | "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | ||
205 | "pxor %%mm2, %%mm0 \n\t" | ||
206 | "pxor %%mm3, %%mm1 \n\t" | ||
207 | "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) | ||
208 | "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) | ||
209 | "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q | ||
210 | "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q | ||
211 | "pxor %%mm4, %%mm4 \n\t" | ||
212 | "pxor %%mm5, %%mm5 \n\t" // FIXME slow | ||
213 | "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | ||
214 | "pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 | ||
215 | "psraw $3, %%mm0 \n\t" | ||
216 | "psraw $3, %%mm1 \n\t" | ||
// (x - 1) | 1 : force the magnitude to be odd.
217 | "psubw %%mm7, %%mm0 \n\t" | ||
218 | "psubw %%mm7, %%mm1 \n\t" | ||
219 | "por %%mm7, %%mm0 \n\t" | ||
220 | "por %%mm7, %%mm1 \n\t" | ||
// Re-apply the original sign, then clear lanes whose input was zero.
221 | "pxor %%mm2, %%mm0 \n\t" | ||
222 | "pxor %%mm3, %%mm1 \n\t" | ||
223 | "psubw %%mm2, %%mm0 \n\t" | ||
224 | "psubw %%mm3, %%mm1 \n\t" | ||
225 | "pandn %%mm0, %%mm4 \n\t" | ||
226 | "pandn %%mm1, %%mm5 \n\t" | ||
227 | "movq %%mm4, (%0, %%"FF_REG_a") \n\t" | ||
228 | "movq %%mm5, 8(%0, %%"FF_REG_a")\n\t" | ||
229 | |||
// Advance by 16 bytes; keep looping while the offset is still negative.
230 | "add $16, %%"FF_REG_a" \n\t" | ||
231 | "js 1b \n\t" | ||
232 | ✗ | ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) | |
233 | : "%"FF_REG_a, "memory" | ||
234 | ); | ||
/* Store the separately computed DC value over the loop's result for block[0]. */
235 | ✗ | block[0]= block0; | |
236 | ✗ | } | |
237 | |||
/**
 * MPEG-1 inter block dequantizer implemented with MMX inline assembly.
 *
 * For every processed coefficient c with q = qscale * quant_matrix[i]
 * (quant_matrix = s->inter_matrix), the stored result is
 *     sign(c) * (((((2 * |c| + 1) * q) >> 4) - 1) | 1)
 * computed in packed 16-bit lanes; a zero coefficient stays zero.
 *
 * @param s      context; this function reads block_last_index[],
 *               intra_scantable.raster_end[] and inter_matrix
 * @param block  coefficient array, modified in place
 * @param n      block index, used to look up block_last_index[n]
 * @param qscale quantiser scale
 *
 * NOTE(review): the MMX registers used are not in the clobber list -- confirm
 * this is acceptable for the supported compilers.
 */
238 | ✗ | static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s, | |
239 | int16_t *block, int n, int qscale) | ||
240 | { | ||
241 | x86_reg nCoeffs; | ||
242 | const uint16_t *quant_matrix; | ||
243 | |||
244 | av_assert2(s->block_last_index[n]>=0); | ||
245 | |||
/* NOTE(review): the end position is read from intra_scantable even though this
 * is the inter path -- presumably intentional; confirm against the C version. */
246 | ✗ | nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; | |
247 | |||
248 | ✗ | quant_matrix = s->inter_matrix; | |
/* Operands: %0 = block + nCoeffs, %1 = quant_matrix + nCoeffs, %2 = qscale,
 * %3 = -2 * nCoeffs, copied into FF_REG_a as the running byte offset.
 * Setup: mm7 = 1 in all four words, mm6 = qscale in all four words. */
249 | ✗ | __asm__ volatile( | |
250 | "pcmpeqw %%mm7, %%mm7 \n\t" | ||
251 | "psrlw $15, %%mm7 \n\t" | ||
252 | "movd %2, %%mm6 \n\t" | ||
253 | "packssdw %%mm6, %%mm6 \n\t" | ||
254 | "packssdw %%mm6, %%mm6 \n\t" | ||
255 | "mov %3, %%"FF_REG_a" \n\t" | ||
256 | ".p2align 4 \n\t" | ||
257 | "1: \n\t" | ||
258 | "movq (%0, %%"FF_REG_a"), %%mm0 \n\t" | ||
259 | "movq 8(%0, %%"FF_REG_a"), %%mm1\n\t" | ||
260 | "movq (%1, %%"FF_REG_a"), %%mm4 \n\t" | ||
261 | "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t" | ||
262 | "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | ||
263 | "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | ||
264 | "pxor %%mm2, %%mm2 \n\t" | ||
265 | "pxor %%mm3, %%mm3 \n\t" | ||
266 | "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | ||
267 | "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | ||
268 | "pxor %%mm2, %%mm0 \n\t" | ||
269 | "pxor %%mm3, %%mm1 \n\t" | ||
270 | "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) | ||
271 | "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) | ||
272 | "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 | ||
273 | "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 | ||
274 | "paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1 | ||
275 | "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1 | ||
276 | "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q | ||
277 | "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q | ||
278 | "pxor %%mm4, %%mm4 \n\t" | ||
279 | "pxor %%mm5, %%mm5 \n\t" // FIXME slow | ||
280 | "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | ||
281 | "pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 | ||
282 | "psraw $4, %%mm0 \n\t" | ||
283 | "psraw $4, %%mm1 \n\t" | ||
// (x - 1) | 1 : force the magnitude to be odd.
284 | "psubw %%mm7, %%mm0 \n\t" | ||
285 | "psubw %%mm7, %%mm1 \n\t" | ||
286 | "por %%mm7, %%mm0 \n\t" | ||
287 | "por %%mm7, %%mm1 \n\t" | ||
// Re-apply the original sign, then clear lanes whose input was zero.
288 | "pxor %%mm2, %%mm0 \n\t" | ||
289 | "pxor %%mm3, %%mm1 \n\t" | ||
290 | "psubw %%mm2, %%mm0 \n\t" | ||
291 | "psubw %%mm3, %%mm1 \n\t" | ||
292 | "pandn %%mm0, %%mm4 \n\t" | ||
293 | "pandn %%mm1, %%mm5 \n\t" | ||
294 | "movq %%mm4, (%0, %%"FF_REG_a") \n\t" | ||
295 | "movq %%mm5, 8(%0, %%"FF_REG_a")\n\t" | ||
296 | |||
// Advance by 16 bytes; keep looping while the offset is still negative.
297 | "add $16, %%"FF_REG_a" \n\t" | ||
298 | "js 1b \n\t" | ||
299 | ✗ | ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) | |
300 | : "%"FF_REG_a, "memory" | ||
301 | ); | ||
302 | ✗ | } | |
303 | |||
/**
 * MPEG-2 intra block dequantizer implemented with MMX inline assembly.
 *
 * qscale is first remapped: through ff_mpeg2_non_linear_qscale[] when
 * s->q_scale_type is set, otherwise doubled. Then, for every processed
 * coefficient c with q = qscale * quant_matrix[i]
 * (quant_matrix = s->intra_matrix), the stored result is
 *     sign(c) * ((|c| * q) >> 4)
 * computed in packed 16-bit lanes; a zero coefficient stays zero.
 * The DC coefficient is computed in C before the asm and written to block[0]
 * afterwards, replacing the vector result for that position.
 *
 * @param s      context; this function reads q_scale_type, block_last_index[],
 *               intra_scantable.raster_end[], y_dc_scale, c_dc_scale and
 *               intra_matrix
 * @param block  coefficient array, modified in place
 * @param n      block index; n < 4 selects y_dc_scale, otherwise c_dc_scale
 * @param qscale quantiser scale (before the remapping described above)
 *
 * NOTE(review): the MMX registers used are not in the clobber list -- confirm
 * this is acceptable for the supported compilers.
 */
304 | ✗ | static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s, | |
305 | int16_t *block, int n, int qscale) | ||
306 | { | ||
307 | x86_reg nCoeffs; | ||
308 | const uint16_t *quant_matrix; | ||
309 | int block0; | ||
310 | |||
311 | av_assert2(s->block_last_index[n]>=0); | ||
312 | |||
313 | ✗ | if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale]; | |
314 | ✗ | else qscale <<= 1; | |
315 | |||
316 | ✗ | nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; | |
317 | |||
318 | ✗ | if (n < 4) | |
319 | ✗ | block0 = block[0] * s->y_dc_scale; | |
320 | else | ||
321 | ✗ | block0 = block[0] * s->c_dc_scale; | |
322 | ✗ | quant_matrix = s->intra_matrix; | |
/* Operands: %0 = block + nCoeffs, %1 = quant_matrix + nCoeffs, %2 = qscale,
 * %3 = -2 * nCoeffs, copied into FF_REG_a as the running byte offset.
 * Setup: mm7 = 1 in all four words (not used further in this loop),
 * mm6 = qscale in all four words. */
323 | ✗ | __asm__ volatile( | |
324 | "pcmpeqw %%mm7, %%mm7 \n\t" | ||
325 | "psrlw $15, %%mm7 \n\t" | ||
326 | "movd %2, %%mm6 \n\t" | ||
327 | "packssdw %%mm6, %%mm6 \n\t" | ||
328 | "packssdw %%mm6, %%mm6 \n\t" | ||
329 | "mov %3, %%"FF_REG_a" \n\t" | ||
330 | ".p2align 4 \n\t" | ||
331 | "1: \n\t" | ||
332 | "movq (%0, %%"FF_REG_a"), %%mm0 \n\t" | ||
333 | "movq 8(%0, %%"FF_REG_a"), %%mm1\n\t" | ||
334 | "movq (%1, %%"FF_REG_a"), %%mm4 \n\t" | ||
335 | "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t" | ||
336 | "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | ||
337 | "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | ||
338 | "pxor %%mm2, %%mm2 \n\t" | ||
339 | "pxor %%mm3, %%mm3 \n\t" | ||
340 | "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | ||
341 | "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | ||
342 | "pxor %%mm2, %%mm0 \n\t" | ||
343 | "pxor %%mm3, %%mm1 \n\t" | ||
344 | "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) | ||
345 | "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) | ||
346 | "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q | ||
347 | "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q | ||
348 | "pxor %%mm4, %%mm4 \n\t" | ||
349 | "pxor %%mm5, %%mm5 \n\t" // FIXME slow | ||
350 | "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | ||
351 | "pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 | ||
352 | "psraw $4, %%mm0 \n\t" | ||
353 | "psraw $4, %%mm1 \n\t" | ||
// Re-apply the original sign, then clear lanes whose input was zero.
354 | "pxor %%mm2, %%mm0 \n\t" | ||
355 | "pxor %%mm3, %%mm1 \n\t" | ||
356 | "psubw %%mm2, %%mm0 \n\t" | ||
357 | "psubw %%mm3, %%mm1 \n\t" | ||
358 | "pandn %%mm0, %%mm4 \n\t" | ||
359 | "pandn %%mm1, %%mm5 \n\t" | ||
360 | "movq %%mm4, (%0, %%"FF_REG_a") \n\t" | ||
361 | "movq %%mm5, 8(%0, %%"FF_REG_a")\n\t" | ||
362 | |||
// Advance by 16 bytes; keep looping while the offset is still <= 0.
363 | "add $16, %%"FF_REG_a" \n\t" | ||
364 | "jng 1b \n\t" | ||
365 | ✗ | ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) | |
366 | : "%"FF_REG_a, "memory" | ||
367 | ); | ||
/* Store the separately computed DC value over the loop's result for block[0]. */
368 | ✗ | block[0]= block0; | |
369 | //Note, we do not do mismatch control for intra as errors cannot accumulate | ||
370 | ✗ | } | |
371 | |||
/**
 * MPEG-2 inter block dequantizer with mismatch control, implemented with MMX
 * inline assembly.
 *
 * qscale is first remapped: through ff_mpeg2_non_linear_qscale[] when
 * s->q_scale_type is set, otherwise doubled. Then, for every processed
 * coefficient c with q = qscale * quant_matrix[i]
 * (quant_matrix = s->inter_matrix), the stored result is
 *     sign(c) * (((2 * |c| + 1) * q) >> 5)      (logical shift on 16-bit lanes)
 * and a zero coefficient stays zero.
 *
 * While looping, all stored words are XORed into mm7, which starts with
 * 0xFFFF in its lowest word. After the loop the four lanes are folded
 * together and bit 0 of the fold is XORed into the least significant bit of
 * block[63]; i.e. that bit is toggled when the XOR of the low bits of all
 * stored values is 0.
 *
 * @param s      context; this function reads q_scale_type, block_last_index[],
 *               intra_scantable.raster_end[] and inter_matrix
 * @param block  coefficient array, modified in place (block[63] may be
 *               adjusted by the mismatch step)
 * @param n      block index, used to look up block_last_index[n]
 * @param qscale quantiser scale (before the remapping described above)
 *
 * NOTE(review): the MMX registers used are not in the clobber list -- confirm
 * this is acceptable for the supported compilers.
 */
372 | 5941 | static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s, | |
373 | int16_t *block, int n, int qscale) | ||
374 | { | ||
375 | x86_reg nCoeffs; | ||
376 | const uint16_t *quant_matrix; | ||
377 | |||
378 | av_assert2(s->block_last_index[n]>=0); | ||
379 | |||
380 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 5941 times.
|
5941 | if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale]; |
381 | 5941 | else qscale <<= 1; | |
382 | |||
/* NOTE(review): the end position is read from intra_scantable even though this
 * is the inter path -- presumably intentional; confirm against the C version. */
383 | 5941 | nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; | |
384 | |||
385 | 5941 | quant_matrix = s->inter_matrix; | |
/* Operands: %0 = block + nCoeffs, %1 = quant_matrix + nCoeffs, %2 = qscale,
 * %3 = -2 * nCoeffs. %3 is copied into FF_REG_a for the loop and is used again
 * unchanged afterwards, so 124(%0, %3) addresses byte 124 of block, i.e. the
 * 32 bits holding block[62] and block[63].
 * Setup: mm7 = 0x000000000000FFFF (parity accumulator), mm6 = qscale x4. */
386 | 5941 | __asm__ volatile( | |
387 | "pcmpeqw %%mm7, %%mm7 \n\t" | ||
388 | "psrlq $48, %%mm7 \n\t" | ||
389 | "movd %2, %%mm6 \n\t" | ||
390 | "packssdw %%mm6, %%mm6 \n\t" | ||
391 | "packssdw %%mm6, %%mm6 \n\t" | ||
392 | "mov %3, %%"FF_REG_a" \n\t" | ||
393 | ".p2align 4 \n\t" | ||
394 | "1: \n\t" | ||
395 | "movq (%0, %%"FF_REG_a"), %%mm0 \n\t" | ||
396 | "movq 8(%0, %%"FF_REG_a"), %%mm1\n\t" | ||
397 | "movq (%1, %%"FF_REG_a"), %%mm4 \n\t" | ||
398 | "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t" | ||
399 | "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | ||
400 | "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | ||
401 | "pxor %%mm2, %%mm2 \n\t" | ||
402 | "pxor %%mm3, %%mm3 \n\t" | ||
403 | "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | ||
404 | "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | ||
405 | "pxor %%mm2, %%mm0 \n\t" | ||
406 | "pxor %%mm3, %%mm1 \n\t" | ||
407 | "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) | ||
408 | "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) | ||
409 | "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 | ||
410 | "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 | ||
411 | "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q | ||
412 | "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q | ||
413 | "paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q | ||
414 | "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q | ||
415 | "pxor %%mm4, %%mm4 \n\t" | ||
416 | "pxor %%mm5, %%mm5 \n\t" // FIXME slow | ||
417 | "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | ||
418 | "pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 | ||
419 | "psrlw $5, %%mm0 \n\t" | ||
420 | "psrlw $5, %%mm1 \n\t" | ||
// Re-apply the original sign, then clear lanes whose input was zero.
421 | "pxor %%mm2, %%mm0 \n\t" | ||
422 | "pxor %%mm3, %%mm1 \n\t" | ||
423 | "psubw %%mm2, %%mm0 \n\t" | ||
424 | "psubw %%mm3, %%mm1 \n\t" | ||
425 | "pandn %%mm0, %%mm4 \n\t" | ||
426 | "pandn %%mm1, %%mm5 \n\t" | ||
// Accumulate the XOR of every stored word for the mismatch step below.
427 | "pxor %%mm4, %%mm7 \n\t" | ||
428 | "pxor %%mm5, %%mm7 \n\t" | ||
429 | "movq %%mm4, (%0, %%"FF_REG_a") \n\t" | ||
430 | "movq %%mm5, 8(%0, %%"FF_REG_a")\n\t" | ||
431 | |||
// Advance by 16 bytes; keep looping while the offset is still <= 0.
432 | "add $16, %%"FF_REG_a" \n\t" | ||
433 | "jng 1b \n\t" | ||
// Mismatch step: fold the four XOR lanes into the low word, move its bit 0 to
// bit 16 (the low bit of block[63] within the loaded dword) and XOR it in.
434 | "movd 124(%0, %3), %%mm0 \n\t" | ||
435 | "movq %%mm7, %%mm6 \n\t" | ||
436 | "psrlq $32, %%mm7 \n\t" | ||
437 | "pxor %%mm6, %%mm7 \n\t" | ||
438 | "movq %%mm7, %%mm6 \n\t" | ||
439 | "psrlq $16, %%mm7 \n\t" | ||
440 | "pxor %%mm6, %%mm7 \n\t" | ||
441 | "pslld $31, %%mm7 \n\t" | ||
442 | "psrlq $15, %%mm7 \n\t" | ||
443 | "pxor %%mm7, %%mm0 \n\t" | ||
444 | "movd %%mm0, 124(%0, %3) \n\t" | ||
445 | |||
446 | 5941 | ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs) | |
447 | : "%"FF_REG_a, "memory" | ||
448 | ); | ||
449 | 5941 | } | |
450 | |||
451 | #endif /* HAVE_MMX_INLINE */ | ||
452 | |||
/**
 * Install the x86 MMX inline-assembly dequantizers into a function table.
 *
 * Does nothing unless the build has HAVE_MMX_INLINE and INLINE_MMX() accepts
 * the flags returned by av_get_cpu_flags().
 *
 * @param s        table whose dct_unquantize_* pointers are overwritten
 * @param bitexact when non-zero, dct_unquantize_mpeg2_intra is left as it was;
 *                 all other pointers are still replaced
 */
453 | 683 | av_cold void ff_mpv_unquantize_init_x86(MPVUnquantDSPContext *s, int bitexact) | |
454 | { | ||
455 | #if HAVE_MMX_INLINE | ||
456 | 683 | int cpu_flags = av_get_cpu_flags(); | |
457 | |||
458 |
2/2✓ Branch 0 taken 86 times.
✓ Branch 1 taken 597 times.
|
683 | if (INLINE_MMX(cpu_flags)) { |
459 | 86 | s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx; | |
460 | 86 | s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx; | |
461 | 86 | s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx; | |
462 | 86 | s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx; | |
/* NOTE(review): presumably skipped in bitexact mode because the MMX MPEG-2
 * intra routine does no mismatch control -- confirm. */
463 |
2/2✓ Branch 0 taken 81 times.
✓ Branch 1 taken 5 times.
|
86 | if (!bitexact) |
464 | 81 | s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx; | |
465 | 86 | s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx; | |
466 | } | ||
467 | #endif /* HAVE_MMX_INLINE */ | ||
468 | 683 | } | |
469 |