dsputil_mmx.c
/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "constants.h"
#include "dsputil_x86.h"

#if HAVE_INLINE_ASM
void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                               int line_size)
{
    const int16_t *p;
    uint8_t *pix;

    /* read the pixels */
    p   = block;
    pix = pixels;
    /* unrolled loop */
    __asm__ volatile (
        "movq      (%3), %%mm0        \n\t"
        "movq     8(%3), %%mm1        \n\t"
        "movq    16(%3), %%mm2        \n\t"
        "movq    24(%3), %%mm3        \n\t"
        "movq    32(%3), %%mm4        \n\t"
        "movq    40(%3), %%mm5        \n\t"
        "movq    48(%3), %%mm6        \n\t"
        "movq    56(%3), %%mm7        \n\t"
        "packuswb %%mm1, %%mm0        \n\t"
        "packuswb %%mm3, %%mm2        \n\t"
        "packuswb %%mm5, %%mm4        \n\t"
        "packuswb %%mm7, %%mm6        \n\t"
        "movq     %%mm0, (%0)         \n\t"
        "movq     %%mm2, (%0, %1)     \n\t"
        "movq     %%mm4, (%0, %1, 2)  \n\t"
        "movq     %%mm6, (%0, %2)     \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
           "r"(p)
        : "memory");
    pix += line_size * 4;
    p   += 32;

    // If this were an exact copy of the block above, the compiler would
    // generate some very strange code, so the operands are passed in
    // through "r" constraints again.
    __asm__ volatile (
        "movq      (%3), %%mm0        \n\t"
        "movq     8(%3), %%mm1        \n\t"
        "movq    16(%3), %%mm2        \n\t"
        "movq    24(%3), %%mm3        \n\t"
        "movq    32(%3), %%mm4        \n\t"
        "movq    40(%3), %%mm5        \n\t"
        "movq    48(%3), %%mm6        \n\t"
        "movq    56(%3), %%mm7        \n\t"
        "packuswb %%mm1, %%mm0        \n\t"
        "packuswb %%mm3, %%mm2        \n\t"
        "packuswb %%mm5, %%mm4        \n\t"
        "packuswb %%mm7, %%mm6        \n\t"
        "movq     %%mm0, (%0)         \n\t"
        "movq     %%mm2, (%0, %1)     \n\t"
        "movq     %%mm4, (%0, %1, 2)  \n\t"
        "movq     %%mm6, (%0, %2)     \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
        : "memory");
}
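
/* For reference, a minimal scalar sketch of what the two MMX blocks above
 * compute: each int16_t coefficient of the 8x8 block is clamped to
 * [0, 255] (the packuswb saturation) and stored as a byte. The helper
 * name is hypothetical and not part of the Libav API. */
static void put_pixels_clamped_sketch(const int16_t *block, uint8_t *pixels,
                                      int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++)
        for (j = 0; j < 8; j++) {
            int v = block[i * 8 + j];
            pixels[i * line_size + j] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
}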

#define put_signed_pixels_clamped_mmx_half(off)      \
    "movq          "#off"(%2), %%mm1   \n\t"         \
    "movq     16 + "#off"(%2), %%mm2   \n\t"         \
    "movq     32 + "#off"(%2), %%mm3   \n\t"         \
    "movq     48 + "#off"(%2), %%mm4   \n\t"         \
    "packsswb  8 + "#off"(%2), %%mm1   \n\t"         \
    "packsswb 24 + "#off"(%2), %%mm2   \n\t"         \
    "packsswb 40 + "#off"(%2), %%mm3   \n\t"         \
    "packsswb 56 + "#off"(%2), %%mm4   \n\t"         \
    "paddb %%mm0, %%mm1                \n\t"         \
    "paddb %%mm0, %%mm2                \n\t"         \
    "paddb %%mm0, %%mm3                \n\t"         \
    "paddb %%mm0, %%mm4                \n\t"         \
    "movq  %%mm1, (%0)                 \n\t"         \
    "movq  %%mm2, (%0, %3)             \n\t"         \
    "movq  %%mm3, (%0, %3, 2)          \n\t"         \
    "movq  %%mm4, (%0, %1)             \n\t"

void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                                      int line_size)
{
    x86_reg line_skip = line_size;
    x86_reg line_skip3;

    __asm__ volatile (
        "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
        "lea (%3, %3, 2), %1            \n\t"
        put_signed_pixels_clamped_mmx_half(0)
        "lea (%0, %3, 4), %0            \n\t"
        put_signed_pixels_clamped_mmx_half(64)
        : "+&r"(pixels), "=&r"(line_skip3)
        : "r"(block), "r"(line_skip)
        : "memory");
}
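
/* A minimal scalar sketch of the routine above, for illustration: each
 * coefficient is saturated to [-128, 127] (packsswb) and then biased by
 * 128 (the paddb with ff_pb_80), mapping it into the unsigned byte range.
 * The helper name is hypothetical. */
static void put_signed_pixels_clamped_sketch(const int16_t *block,
                                             uint8_t *pixels, int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++)
        for (j = 0; j < 8; j++) {
            int v = block[i * 8 + j];
            v = v < -128 ? -128 : v > 127 ? 127 : v;
            pixels[i * line_size + j] = v + 128;
        }
}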

void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                               int line_size)
{
    const int16_t *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p   = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm__ volatile (
            "movq        (%2), %%mm0    \n\t"
            "movq       8(%2), %%mm1    \n\t"
            "movq      16(%2), %%mm2    \n\t"
            "movq      24(%2), %%mm3    \n\t"
            "movq          %0, %%mm4    \n\t"
            "movq          %1, %%mm6    \n\t"
            "movq       %%mm4, %%mm5    \n\t"
            "punpcklbw  %%mm7, %%mm4    \n\t"
            "punpckhbw  %%mm7, %%mm5    \n\t"
            "paddsw     %%mm4, %%mm0    \n\t"
            "paddsw     %%mm5, %%mm1    \n\t"
            "movq       %%mm6, %%mm5    \n\t"
            "punpcklbw  %%mm7, %%mm6    \n\t"
            "punpckhbw  %%mm7, %%mm5    \n\t"
            "paddsw     %%mm6, %%mm2    \n\t"
            "paddsw     %%mm5, %%mm3    \n\t"
            "packuswb   %%mm1, %%mm0    \n\t"
            "packuswb   %%mm3, %%mm2    \n\t"
            "movq       %%mm0, %0       \n\t"
            "movq       %%mm2, %1       \n\t"
            : "+m"(*pix), "+m"(*(pix + line_size))
            : "r"(p)
            : "memory");
        pix += line_size * 2;
        p   += 16;
    } while (--i);
}
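
/* Scalar sketch of the loop above (illustrative, hypothetical helper):
 * each output byte is the existing pixel plus the residual coefficient,
 * clamped to [0, 255] by the paddsw/packuswb pair. */
static void add_pixels_clamped_sketch(const int16_t *block, uint8_t *pixels,
                                      int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++)
        for (j = 0; j < 8; j++) {
            int v = pixels[i * line_size + j] + block[i * 8 + j];
            pixels[i * line_size + j] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
}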

#define CLEAR_BLOCKS(name, n)                           \
void name(int16_t *blocks)                              \
{                                                       \
    __asm__ volatile (                                  \
        "pxor %%mm7, %%mm7              \n\t"           \
        "mov     %1, %%"REG_a"          \n\t"           \
        "1:                             \n\t"           \
        "movq %%mm7,   (%0, %%"REG_a")  \n\t"           \
        "movq %%mm7,  8(%0, %%"REG_a")  \n\t"           \
        "movq %%mm7, 16(%0, %%"REG_a")  \n\t"           \
        "movq %%mm7, 24(%0, %%"REG_a")  \n\t"           \
        "add    $32, %%"REG_a"          \n\t"           \
        "js 1b                          \n\t"           \
        :: "r"(((uint8_t *)blocks) + 128 * n),          \
           "i"(-128 * n)                                \
        : "%"REG_a                                      \
        );                                              \
}
CLEAR_BLOCKS(ff_clear_blocks_mmx, 6)
CLEAR_BLOCKS(ff_clear_block_mmx, 1)

void ff_clear_block_sse(int16_t *block)
{
    __asm__ volatile (
        "xorps  %%xmm0, %%xmm0          \n"
        "movaps %%xmm0,    (%0)         \n"
        "movaps %%xmm0,  16(%0)         \n"
        "movaps %%xmm0,  32(%0)         \n"
        "movaps %%xmm0,  48(%0)         \n"
        "movaps %%xmm0,  64(%0)         \n"
        "movaps %%xmm0,  80(%0)         \n"
        "movaps %%xmm0,  96(%0)         \n"
        "movaps %%xmm0, 112(%0)         \n"
        :: "r"(block)
        : "memory"
        );
}

void ff_clear_blocks_sse(int16_t *blocks)
{
    __asm__ volatile (
        "xorps  %%xmm0, %%xmm0              \n"
        "mov        %1, %%"REG_a"           \n"
        "1:                                 \n"
        "movaps %%xmm0,    (%0, %%"REG_a")  \n"
        "movaps %%xmm0,  16(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  32(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  48(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  64(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  80(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  96(%0, %%"REG_a")  \n"
        "movaps %%xmm0, 112(%0, %%"REG_a")  \n"
        "add      $128, %%"REG_a"           \n"
        "js 1b                              \n"
        :: "r"(((uint8_t *)blocks) + 128 * 6),
           "i"(-128 * 6)
        : "%"REG_a
        );
}
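
/* All four clear routines above just zero n consecutive 64-coefficient
 * blocks; the SSE variants additionally assume 16-byte-aligned blocks
 * (movaps). A scalar sketch with a hypothetical name: */
static void clear_blocks_sketch(int16_t *blocks, int n)
{
    int i;

    for (i = 0; i < n * 64; i++)
        blocks[i] = 0;
}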

void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
{
    x86_reg i = 0;
    __asm__ volatile (
        "jmp          2f                \n\t"
        "1:                             \n\t"
        "movq   (%1, %0), %%mm0         \n\t"
        "movq   (%2, %0), %%mm1         \n\t"
        "paddb     %%mm0, %%mm1         \n\t"
        "movq      %%mm1, (%2, %0)      \n\t"
        "movq  8(%1, %0), %%mm0         \n\t"
        "movq  8(%2, %0), %%mm1         \n\t"
        "paddb     %%mm0, %%mm1         \n\t"
        "movq      %%mm1, 8(%2, %0)     \n\t"
        "add         $16, %0            \n\t"
        "2:                             \n\t"
        "cmp          %3, %0            \n\t"
        "js 1b                          \n\t"
        : "+r"(i)
        : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
        );
    for ( ; i < w; i++)
        dst[i + 0] += src[i + 0];
}
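
/* The MMX loop processes 16 bytes per iteration and the scalar tail above
 * finishes the remainder; the whole routine is equivalent to this sketch
 * (hypothetical name), where the addition wraps modulo 256 like paddb: */
static void add_bytes_sketch(uint8_t *dst, const uint8_t *src, int w)
{
    int i;

    for (i = 0; i < w; i++)
        dst[i] += src[i];
}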

/* Draw the edges of width 'w' of an image of size width x height.
 * This MMX version can only handle w == 8 or w == 16. */
void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
                       int w, int h, int sides)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    /* left and right */
    ptr = buf;
    if (w == 8) {
        __asm__ volatile (
            "1:                             \n\t"
            "movd            (%0), %%mm0    \n\t"
            "punpcklbw      %%mm0, %%mm0    \n\t"
            "punpcklwd      %%mm0, %%mm0    \n\t"
            "punpckldq      %%mm0, %%mm0    \n\t"
            "movq           %%mm0, -8(%0)   \n\t"
            "movq      -8(%0, %2), %%mm1    \n\t"
            "punpckhbw      %%mm1, %%mm1    \n\t"
            "punpckhwd      %%mm1, %%mm1    \n\t"
            "punpckhdq      %%mm1, %%mm1    \n\t"
            "movq           %%mm1, (%0, %2) \n\t"
            "add               %1, %0       \n\t"
            "cmp               %3, %0       \n\t"
            "jb 1b                          \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    } else {
        __asm__ volatile (
            "1:                               \n\t"
            "movd            (%0), %%mm0      \n\t"
            "punpcklbw      %%mm0, %%mm0      \n\t"
            "punpcklwd      %%mm0, %%mm0      \n\t"
            "punpckldq      %%mm0, %%mm0      \n\t"
            "movq           %%mm0, -8(%0)     \n\t"
            "movq           %%mm0, -16(%0)    \n\t"
            "movq      -8(%0, %2), %%mm1      \n\t"
            "punpckhbw      %%mm1, %%mm1      \n\t"
            "punpckhwd      %%mm1, %%mm1      \n\t"
            "punpckhdq      %%mm1, %%mm1      \n\t"
            "movq           %%mm1, (%0, %2)   \n\t"
            "movq           %%mm1, 8(%0, %2)  \n\t"
            "add               %1, %0         \n\t"
            "cmp               %3, %0         \n\t"
            "jb 1b                            \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    }

    /* top and bottom (and hopefully also the corners) */
    if (sides & EDGE_TOP) {
        for (i = 0; i < h; i += 4) {
            ptr = buf - (i + 1) * wrap - w;
            __asm__ volatile (
                "1:                             \n\t"
                "movq (%1, %0), %%mm0           \n\t"
                "movq    %%mm0, (%0)            \n\t"
                "movq    %%mm0, (%0, %2)        \n\t"
                "movq    %%mm0, (%0, %2, 2)     \n\t"
                "movq    %%mm0, (%0, %3)        \n\t"
                "add        $8, %0              \n\t"
                "cmp        %4, %0              \n\t"
                "jb 1b                          \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
                  "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
                );
        }
    }

    if (sides & EDGE_BOTTOM) {
        for (i = 0; i < h; i += 4) {
            ptr = last_line + (i + 1) * wrap - w;
            __asm__ volatile (
                "1:                             \n\t"
                "movq (%1, %0), %%mm0           \n\t"
                "movq    %%mm0, (%0)            \n\t"
                "movq    %%mm0, (%0, %2)        \n\t"
                "movq    %%mm0, (%0, %2, 2)     \n\t"
                "movq    %%mm0, (%0, %3)        \n\t"
                "add        $8, %0              \n\t"
                "cmp        %4, %0              \n\t"
                "jb 1b                          \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)last_line - (x86_reg)ptr - w),
                  "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
                  "r"(ptr + width + 2 * w)
                );
        }
    }
}
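
/* Illustrative sketch of the left/right pass above (hypothetical helper):
 * each line's first and last pixel are replicated w bytes outward into
 * the border, which is what the punpck broadcast sequences implement. */
static void draw_edges_lr_sketch(uint8_t *buf, int wrap, int width,
                                 int height, int w)
{
    int x, y;

    for (y = 0; y < height; y++) {
        uint8_t *line = buf + y * wrap;
        for (x = 1; x <= w; x++) {
            line[-x]            = line[0];         /* left border  */
            line[width - 1 + x] = line[width - 1]; /* right border */
        }
    }
}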

void ff_gmc_mmx(uint8_t *dst, uint8_t *src,
                int stride, int h, int ox, int oy,
                int dxx, int dxy, int dyx, int dyy,
                int shift, int r, int width, int height)
{
    const int w    = 8;
    const int ix   = ox >> (16 + shift);
    const int iy   = oy >> (16 + shift);
    const int oxs  = ox >> 4;
    const int oys  = oy >> 4;
    const int dxxs = dxx >> 4;
    const int dxys = dxy >> 4;
    const int dyxs = dyx >> 4;
    const int dyys = dyy >> 4;
    const uint16_t r4[4]   = { r, r, r, r };
    const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
    const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
    const uint64_t shift2  = 2 * shift;
    int x, y;

    const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
    const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
    const int dxh = dxy * (h - 1);
    const int dyw = dyx * (w - 1);
    if ( // non-constant fullpel offset (3% of blocks)
        ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
         (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
        // uses more than 16 bits of subpel mv (only at huge resolution)
        || (dxx | dxy | dyx | dyy) & 15 ||
        (unsigned)ix >= width  - w ||
        (unsigned)iy >= height - h) {
        // FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
                 shift, r, width, height);
        return;
    }

    src += ix + iy * stride;

    __asm__ volatile (
        "movd         %0, %%mm6 \n\t"
        "pxor      %%mm7, %%mm7 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        :: "r"(1 << shift)
        );

    for (x = 0; x < w; x += 4) {
        uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
                            oxs - dxys + dxxs * (x + 1),
                            oxs - dxys + dxxs * (x + 2),
                            oxs - dxys + dxxs * (x + 3) };
        uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
                            oys - dyys + dyxs * (x + 1),
                            oys - dyys + dyxs * (x + 2),
                            oys - dyys + dyxs * (x + 3) };

        for (y = 0; y < h; y++) {
            __asm__ volatile (
                "movq      %0, %%mm4    \n\t"
                "movq      %1, %%mm5    \n\t"
                "paddw     %2, %%mm4    \n\t"
                "paddw     %3, %%mm5    \n\t"
                "movq   %%mm4, %0       \n\t"
                "movq   %%mm5, %1       \n\t"
                "psrlw    $12, %%mm4    \n\t"
                "psrlw    $12, %%mm5    \n\t"
                : "+m"(*dx4), "+m"(*dy4)
                : "m"(*dxy4), "m"(*dyy4)
                );

            __asm__ volatile (
                "movq      %%mm6, %%mm2 \n\t"
                "movq      %%mm6, %%mm1 \n\t"
                "psubw     %%mm4, %%mm2 \n\t"
                "psubw     %%mm5, %%mm1 \n\t"
                "movq      %%mm2, %%mm0 \n\t"
                "movq      %%mm4, %%mm3 \n\t"
                "pmullw    %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
                "pmullw    %%mm5, %%mm3 \n\t" // dx * dy
                "pmullw    %%mm5, %%mm2 \n\t" // (s - dx) * dy
                "pmullw    %%mm4, %%mm1 \n\t" // dx * (s - dy)

                "movd         %4, %%mm5 \n\t"
                "movd         %3, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw    %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
                "pmullw    %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy

                "movd         %2, %%mm5 \n\t"
                "movd         %1, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw    %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
                "pmullw    %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
                "paddw        %5, %%mm1 \n\t"
                "paddw     %%mm3, %%mm2 \n\t"
                "paddw     %%mm1, %%mm0 \n\t"
                "paddw     %%mm2, %%mm0 \n\t"

                "psrlw        %6, %%mm0 \n\t"
                "packuswb  %%mm0, %%mm0 \n\t"
                "movd      %%mm0, %0    \n\t"

                : "=m"(dst[x + y * stride])
                : "m"(src[0]), "m"(src[1]),
                  "m"(src[stride]), "m"(src[stride + 1]),
                  "m"(*r4), "m"(shift2)
                );
            src += stride;
        }
        src += 4 - h * stride;
    }
}
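
/* Per-pixel sketch of the bilinear blend the inner loop above performs,
 * with s = 1 << shift and dx, dy the subpel fractions (illustrative,
 * hypothetical helper): */
static int gmc_pixel_sketch(const uint8_t *src, int stride,
                            int dx, int dy, int shift, int r)
{
    const int s = 1 << shift;

    return (src[0]          * (s - dx) * (s - dy) +
            src[1]          * dx       * (s - dy) +
            src[stride]     * (s - dx) * dy       +
            src[stride + 1] * dx       * dy       + r) >> (2 * shift);
}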

void ff_vector_clipf_sse(float *dst, const float *src,
                         float min, float max, int len)
{
    x86_reg i = (len - 16) * 4;
    __asm__ volatile (
        "movss          %3, %%xmm4      \n\t"
        "movss          %4, %%xmm5      \n\t"
        "shufps $0, %%xmm4, %%xmm4      \n\t"
        "shufps $0, %%xmm5, %%xmm5      \n\t"
        "1:                             \n\t"
        "movaps   (%2, %0), %%xmm0      \n\t" // 3/1 on intel
        "movaps 16(%2, %0), %%xmm1      \n\t"
        "movaps 32(%2, %0), %%xmm2      \n\t"
        "movaps 48(%2, %0), %%xmm3      \n\t"
        "maxps      %%xmm4, %%xmm0      \n\t"
        "maxps      %%xmm4, %%xmm1      \n\t"
        "maxps      %%xmm4, %%xmm2      \n\t"
        "maxps      %%xmm4, %%xmm3      \n\t"
        "minps      %%xmm5, %%xmm0      \n\t"
        "minps      %%xmm5, %%xmm1      \n\t"
        "minps      %%xmm5, %%xmm2      \n\t"
        "minps      %%xmm5, %%xmm3      \n\t"
        "movaps     %%xmm0,   (%1, %0)  \n\t"
        "movaps     %%xmm1, 16(%1, %0)  \n\t"
        "movaps     %%xmm2, 32(%1, %0)  \n\t"
        "movaps     %%xmm3, 48(%1, %0)  \n\t"
        "sub           $64, %0          \n\t"
        "jge 1b                         \n\t"
        : "+&r"(i)
        : "r"(dst), "r"(src), "m"(min), "m"(max)
        : "memory"
        );
}
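
/* Scalar sketch of the SSE routine above (hypothetical helper). The SSE
 * version processes 16 floats per iteration and, with its movaps loads
 * and stores, assumes 16-byte-aligned buffers and a len that is a
 * multiple of 16; the plain equivalent is: */
static void vector_clipf_sketch(float *dst, const float *src,
                                float min, float max, int len)
{
    int i;

    for (i = 0; i < len; i++)
        dst[i] = src[i] < min ? min : src[i] > max ? max : src[i];
}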

#endif /* HAVE_INLINE_ASM */