| Line | Branch | Exec | Source | 
|---|---|---|---|
| 1 | /* | ||
| 2 | * This file is part of FFmpeg. | ||
| 3 | * | ||
| 4 | * FFmpeg is free software; you can redistribute it and/or | ||
| 5 | * modify it under the terms of the GNU Lesser General Public | ||
| 6 | * License as published by the Free Software Foundation; either | ||
| 7 | * version 2.1 of the License, or (at your option) any later version. | ||
| 8 | * | ||
| 9 | * FFmpeg is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 12 | * Lesser General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU Lesser General Public | ||
| 15 | * License along with FFmpeg; if not, write to the Free Software | ||
| 16 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 17 | */ | ||
| 18 | |||
| 19 | #include "config.h" | ||
| 20 | |||
| 21 | #include "pixelutils.h" | ||
| 22 | #include "cpu.h" | ||
| 23 | |||
| 24 | int ff_pixelutils_sad_8x8_mmxext(const uint8_t *src1, ptrdiff_t stride1, | ||
| 25 | const uint8_t *src2, ptrdiff_t stride2); | ||
| 26 | |||
| 27 | int ff_pixelutils_sad_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1, | ||
| 28 | const uint8_t *src2, ptrdiff_t stride2); | ||
| 29 | int ff_pixelutils_sad_a_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1, | ||
| 30 | const uint8_t *src2, ptrdiff_t stride2); | ||
| 31 | int ff_pixelutils_sad_u_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1, | ||
| 32 | const uint8_t *src2, ptrdiff_t stride2); | ||
| 33 | |||
| 34 | int ff_pixelutils_sad_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1, | ||
| 35 | const uint8_t *src2, ptrdiff_t stride2); | ||
| 36 | int ff_pixelutils_sad_a_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1, | ||
| 37 | const uint8_t *src2, ptrdiff_t stride2); | ||
| 38 | int ff_pixelutils_sad_u_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1, | ||
| 39 | const uint8_t *src2, ptrdiff_t stride2); | ||
| 40 | |||
| 41 | int ff_pixelutils_sad_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1, | ||
| 42 | const uint8_t *src2, ptrdiff_t stride2); | ||
| 43 | int ff_pixelutils_sad_a_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1, | ||
| 44 | const uint8_t *src2, ptrdiff_t stride2); | ||
| 45 | int ff_pixelutils_sad_u_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1, | ||
| 46 | const uint8_t *src2, ptrdiff_t stride2); | ||
| 47 | |||
| 48 | 62 | void ff_pixelutils_sad_init_x86(av_pixelutils_sad_fn *sad, int aligned) | |
| 49 | { | ||
| 50 | 62 | int cpu_flags = av_get_cpu_flags(); | |
| 51 | |||
| 52 | // The best way to use SSE2 would be to do 2 SADs in parallel, | ||
| 53 | // but we'd have to modify the pixelutils API to return SIMD functions. | ||
| 54 | |||
| 55 | // It's probably not faster to shuffle data around | ||
| 56 | // to get two lines of 8 pixels into a single 16-byte register, | ||
| 57 | // so just use the MMXEXT 8x8 version even when SSE2 is available. | ||
| 58 | 
        2/2✓ Branch 0 taken 60 times. 
          ✓ Branch 1 taken 2 times. 
         | 
      62 | if (EXTERNAL_MMXEXT(cpu_flags)) { | 
| 59 | 60 | sad[2] = ff_pixelutils_sad_8x8_mmxext; | |
| 60 | } | ||
| 61 | |||
| 62 | 
        2/2✓ Branch 0 taken 60 times. 
          ✓ Branch 1 taken 2 times. 
         | 
      62 | if (EXTERNAL_SSE2(cpu_flags)) { | 
| 63 | 
        3/4✓ Branch 0 taken 20 times. 
          ✓ Branch 1 taken 20 times. 
          ✓ Branch 2 taken 20 times. 
          ✗ Branch 3 not taken. 
         | 
      60 | switch (aligned) { | 
| 64 | 20 | case 0: sad[3] = ff_pixelutils_sad_16x16_sse2; break; // src1 unaligned, src2 unaligned | |
| 65 | 20 | case 1: sad[3] = ff_pixelutils_sad_u_16x16_sse2; break; // src1 aligned, src2 unaligned | |
| 66 | 20 | case 2: sad[3] = ff_pixelutils_sad_a_16x16_sse2; break; // src1 aligned, src2 aligned | |
| 67 | } | ||
| 68 | } | ||
| 69 | |||
| 70 | 
        2/2✓ Branch 0 taken 60 times. 
          ✓ Branch 1 taken 2 times. 
         | 
      62 | if (EXTERNAL_SSE2(cpu_flags)) { | 
| 71 | 
        3/4✓ Branch 0 taken 20 times. 
          ✓ Branch 1 taken 20 times. 
          ✓ Branch 2 taken 20 times. 
          ✗ Branch 3 not taken. 
         | 
      60 | switch (aligned) { | 
| 72 | 20 | case 0: sad[4] = ff_pixelutils_sad_32x32_sse2; break; // src1 unaligned, src2 unaligned | |
| 73 | 20 | case 1: sad[4] = ff_pixelutils_sad_u_32x32_sse2; break; // src1 aligned, src2 unaligned | |
| 74 | 20 | case 2: sad[4] = ff_pixelutils_sad_a_32x32_sse2; break; // src1 aligned, src2 aligned | |
| 75 | } | ||
| 76 | } | ||
| 77 | |||
| 78 | 
        3/4✓ Branch 0 taken 60 times. 
          ✓ Branch 1 taken 2 times. 
          ✓ Branch 2 taken 60 times. 
          ✗ Branch 3 not taken. 
         | 
      62 | if (EXTERNAL_AVX2_FAST(cpu_flags)) { | 
| 79 | 
        3/4✓ Branch 0 taken 20 times. 
          ✓ Branch 1 taken 20 times. 
          ✓ Branch 2 taken 20 times. 
          ✗ Branch 3 not taken. 
         | 
      60 | switch (aligned) { | 
| 80 | 20 | case 0: sad[4] = ff_pixelutils_sad_32x32_avx2; break; // src1 unaligned, src2 unaligned | |
| 81 | 20 | case 1: sad[4] = ff_pixelutils_sad_u_32x32_avx2; break; // src1 aligned, src2 unaligned | |
| 82 | 20 | case 2: sad[4] = ff_pixelutils_sad_a_32x32_avx2; break; // src1 aligned, src2 aligned | |
| 83 | } | ||
| 84 | } | ||
| 85 | 62 | } | |
| 86 |