LCOV - code coverage report
Current view: top level - libavcodec - vp9block.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 782 805 97.1 %
Date: 2017-12-16 21:16:39 Functions: 12 12 100.0 %

          Line data    Source code
       1             : /*
       2             :  * VP9 compatible video decoder
       3             :  *
       4             :  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
       5             :  * Copyright (C) 2013 Clément Bœsch <u pkh me>
       6             :  *
       7             :  * This file is part of FFmpeg.
       8             :  *
       9             :  * FFmpeg is free software; you can redistribute it and/or
      10             :  * modify it under the terms of the GNU Lesser General Public
      11             :  * License as published by the Free Software Foundation; either
      12             :  * version 2.1 of the License, or (at your option) any later version.
      13             :  *
      14             :  * FFmpeg is distributed in the hope that it will be useful,
      15             :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      16             :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      17             :  * Lesser General Public License for more details.
      18             :  *
      19             :  * You should have received a copy of the GNU Lesser General Public
      20             :  * License along with FFmpeg; if not, write to the Free Software
      21             :  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
      22             :  */
      23             : 
      24             : #include "libavutil/avassert.h"
      25             : 
      26             : #include "avcodec.h"
      27             : #include "internal.h"
      28             : #include "videodsp.h"
      29             : #include "vp56.h"
      30             : #include "vp9.h"
      31             : #include "vp9data.h"
      32             : #include "vp9dec.h"
      33             : 
      /*
       * setctx_2d: fill a 2-D region of bytes with a single value.
       *
       * Starting at ptr, the value v is stored across w consecutive bytes, ptr is
       * advanced by stride, and this repeats for h rows in total.
       *
       *   ptr    - first byte of the region. The 2/4/8-wide paths store through
       *            AV_WN16A / AV_WN32A / AV_WN64A, which are defined outside this
       *            listing (presumably aligned native-width stores, in which case
       *            ptr and stride need matching alignment -- TODO confirm).
       *   w      - row width in bytes. Only 1, 2, 4 and 8 are handled; the switch
       *            has no default, so any other width stores nothing at all.
       *   h      - row count. Every loop is do { ... } while (--h), so one row is
       *            always written and h must be >= 1 on entry.
       *   stride - byte offset added to ptr after each row.
       *   v      - value to store. The multiplications by 0x0101... copy it into
       *            every byte of the wider store only when 0 <= v <= 255.
       *
       * NOTE(review): "v * 0x01010101" multiplies two ints. For v >= 128 the
       * result does not fit a 32-bit int, i.e. signed overflow, before it is
       * converted to uint32_t. The one call visible in this listing passes
       * b->seg_id -- confirm its range, or make the operand unsigned.
       *
       * In this coverage run only the HAVE_FAST_64BIT branch of the 8-wide path
       * carries execution counts; the #else branch shows none.
       */
      34      819769 : static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
      35             :                                        ptrdiff_t stride, int v)
      36             : {
      37      819769 :     switch (w) { /* one specialised store loop per supported width */
      38      628547 :     case 1:
      39             :         do {
      40      588737 :             *ptr = v; /* int -> uint8_t conversion keeps the low 8 bits */
      41      588737 :             ptr += stride;
      42      588737 :         } while (--h);
      43      548927 :         break;
      44      213070 :     case 2: {
      45      213070 :         int v16 = v * 0x0101; /* v copied into both bytes of a 16-bit value */
      46             :         do {
      47      391645 :             AV_WN16A(ptr, v16);
      48      391645 :             ptr += stride;
      49      391645 :         } while (--h);
      50      213070 :         break;
      51             :     }
      52       46220 :     case 4: {
      53       46220 :         uint32_t v32 = v * 0x01010101; /* v copied into all four bytes; int multiply, see NOTE(review) */
      54             :         do {
      55      169328 :             AV_WN32A(ptr, v32);
      56      169328 :             ptr += stride;
      57      169328 :         } while (--h);
      58       46220 :         break;
      59             :     }
      60       11552 :     case 8: {
      61             : #if HAVE_FAST_64BIT
      62       11552 :         uint64_t v64 = v * 0x0101010101010101ULL; /* ULL operand: multiply is done in unsigned 64-bit */
      63             :         do {
      64       85852 :             AV_WN64A(ptr, v64);
      65       85852 :             ptr += stride;
      66       85852 :         } while (--h);
      67             : #else
      68             :         uint32_t v32 = v * 0x01010101; /* same int multiply as the 4-wide case */
      69             :         do {
      70             :             AV_WN32A(ptr,     v32); /* bytes 0-3 of the row */
      71             :             AV_WN32A(ptr + 4, v32); /* bytes 4-7 of the row */
      72             :             ptr += stride;
      73             :         } while (--h);
      74             : #endif
      75       11552 :         break;
      76             :     }
      77             :     } /* end switch: widths other than 1/2/4/8 reach here without storing */
      78      819769 : }
      79             : 
      80      778642 : static void decode_mode(VP9TileData *td)
      81             : {
      82             :     static const uint8_t left_ctx[N_BS_SIZES] = {
      83             :         0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
      84             :     };
      85             :     static const uint8_t above_ctx[N_BS_SIZES] = {
      86             :         0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
      87             :     };
      88             :     static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
      89             :         TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
      90             :         TX_16X16, TX_8X8,   TX_8X8,   TX_8X8,   TX_4X4,   TX_4X4,  TX_4X4
      91             :     };
      92      778642 :     VP9Context *s = td->s;
      93      778642 :     VP9Block *b = td->b;
      94      778642 :     int row = td->row, col = td->col, row7 = td->row7;
      95      778642 :     enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
      96      778642 :     int bw4 = ff_vp9_bwh_tab[1][b->bs][0], w4 = FFMIN(s->cols - col, bw4);
      97      778642 :     int bh4 = ff_vp9_bwh_tab[1][b->bs][1], h4 = FFMIN(s->rows - row, bh4), y;
      98      778642 :     int have_a = row > 0, have_l = col > td->tile_col_start;
      99             :     int vref, filter_id;
     100             : 
     101      778642 :     if (!s->s.h.segmentation.enabled) {
     102      536919 :         b->seg_id = 0;
     103      241723 :     } else if (s->s.h.keyframe || s->s.h.intraonly) {
     104       15668 :         b->seg_id = !s->s.h.segmentation.update_map ? 0 :
     105        7834 :                     vp8_rac_get_tree(td->c, ff_vp9_segmentation_tree, s->s.h.segmentation.prob);
     106      296691 :     } else if (!s->s.h.segmentation.update_map ||
     107      125604 :                (s->s.h.segmentation.temporal &&
     108       62802 :                 vp56_rac_get_prob_branchy(td->c,
     109      125604 :                     s->s.h.segmentation.pred_prob[s->above_segpred_ctx[col] +
     110       62802 :                                     td->left_segpred_ctx[row7]]))) {
     111      447210 :         if (!s->s.h.errorres && s->s.frames[REF_FRAME_SEGMAP].segmentation_map) {
     112      220127 :             int pred = 8, x;
     113      220127 :             uint8_t *refsegmap = s->s.frames[REF_FRAME_SEGMAP].segmentation_map;
     114             : 
     115      220127 :             if (!s->s.frames[REF_FRAME_SEGMAP].uses_2pass)
     116      220127 :                 ff_thread_await_progress(&s->s.frames[REF_FRAME_SEGMAP].tf, row >> 3, 0);
     117      607409 :             for (y = 0; y < h4; y++) {
     118      387282 :                 int idx_base = (y + row) * 8 * s->sb_cols + col;
     119     1559454 :                 for (x = 0; x < w4; x++)
     120     1172172 :                     pred = FFMIN(pred, refsegmap[idx_base + x]);
     121             :             }
     122             :             av_assert1(pred < 8);
     123      220127 :             b->seg_id = pred;
     124             :         } else {
     125        6956 :             b->seg_id = 0;
     126             :         }
     127             : 
     128      227083 :         memset(&s->above_segpred_ctx[col], 1, w4);
     129      227083 :         memset(&td->left_segpred_ctx[row7], 1, h4);
     130             :     } else {
     131        6806 :         b->seg_id = vp8_rac_get_tree(td->c, ff_vp9_segmentation_tree,
     132        6806 :                                      s->s.h.segmentation.prob);
     133             : 
     134        6806 :         memset(&s->above_segpred_ctx[col], 0, w4);
     135        6806 :         memset(&td->left_segpred_ctx[row7], 0, h4);
     136             :     }
     137     1020365 :     if (s->s.h.segmentation.enabled &&
     138      412810 :         (s->s.h.segmentation.update_map || s->s.h.keyframe || s->s.h.intraonly)) {
     139      141272 :         setctx_2d(&s->s.frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
     140      141272 :                   bw4, bh4, 8 * s->sb_cols, b->seg_id);
     141             :     }
     142             : 
     143     1020365 :     b->skip = s->s.h.segmentation.enabled &&
     144      241723 :         s->s.h.segmentation.feat[b->seg_id].skip_enabled;
     145      778642 :     if (!b->skip) {
     146      777841 :         int c = td->left_skip_ctx[row7] + s->above_skip_ctx[col];
     147      777841 :         b->skip = vp56_rac_get_prob(td->c, s->prob.p.skip[c]);
     148      777841 :         td->counts.skip[c][b->skip]++;
     149             :     }
     150             : 
     151      778642 :     if (s->s.h.keyframe || s->s.h.intraonly) {
     152      218046 :         b->intra = 1;
     153      560596 :     } else if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
     154         801 :         b->intra = !s->s.h.segmentation.feat[b->seg_id].ref_val;
     155             :     } else {
     156             :         int c, bit;
     157             : 
     158      559795 :         if (have_a && have_l) {
     159      516558 :             c = s->above_intra_ctx[col] + td->left_intra_ctx[row7];
     160      516558 :             c += (c == 2);
     161             :         } else {
     162       70417 :             c = have_a ? 2 * s->above_intra_ctx[col] :
     163       27180 :                 have_l ? 2 * td->left_intra_ctx[row7] : 0;
     164             :         }
     165      559795 :         bit = vp56_rac_get_prob(td->c, s->prob.p.intra[c]);
     166      559795 :         td->counts.intra[c][bit]++;
     167      559795 :         b->intra = !bit;
     168             :     }
     169             : 
     170     1140142 :     if ((b->intra || !b->skip) && s->s.h.txfmmode == TX_SWITCHABLE) {
     171             :         int c;
     172      361500 :         if (have_a) {
     173      338644 :             if (have_l) {
     174      922662 :                 c = (s->above_skip_ctx[col] ? max_tx :
     175      594388 :                      s->above_txfm_ctx[col]) +
     176      585730 :                     (td->left_skip_ctx[row7] ? max_tx :
     177      585730 :                      td->left_txfm_ctx[row7]) > max_tx;
     178             :             } else {
     179       19115 :                 c = s->above_skip_ctx[col] ? 1 :
     180        8745 :                     (s->above_txfm_ctx[col] * 2 > max_tx);
     181             :             }
     182       22856 :         } else if (have_l) {
     183       40649 :             c = td->left_skip_ctx[row7] ? 1 :
     184       19053 :                 (td->left_txfm_ctx[row7] * 2 > max_tx);
     185             :         } else {
     186        1260 :             c = 1;
     187             :         }
     188      361500 :         switch (max_tx) {
     189        9077 :         case TX_32X32:
     190        9077 :             b->tx = vp56_rac_get_prob(td->c, s->prob.p.tx32p[c][0]);
     191        9077 :             if (b->tx) {
     192        8944 :                 b->tx += vp56_rac_get_prob(td->c, s->prob.p.tx32p[c][1]);
     193        8944 :                 if (b->tx == 2)
     194        7915 :                     b->tx += vp56_rac_get_prob(td->c, s->prob.p.tx32p[c][2]);
     195             :             }
     196        9077 :             td->counts.tx32p[c][b->tx]++;
     197        9077 :             break;
     198       46008 :         case TX_16X16:
     199       46008 :             b->tx = vp56_rac_get_prob(td->c, s->prob.p.tx16p[c][0]);
     200       46008 :             if (b->tx)
     201       43203 :                 b->tx += vp56_rac_get_prob(td->c, s->prob.p.tx16p[c][1]);
     202       46008 :             td->counts.tx16p[c][b->tx]++;
     203       46008 :             break;
     204      193109 :         case TX_8X8:
     205      193109 :             b->tx = vp56_rac_get_prob(td->c, s->prob.p.tx8p[c]);
     206      193109 :             td->counts.tx8p[c][b->tx]++;
     207      193109 :             break;
     208      113306 :         case TX_4X4:
     209      113306 :             b->tx = TX_4X4;
     210      113306 :             break;
     211             :         }
     212             :     } else {
     213      417142 :         b->tx = FFMIN(max_tx, s->s.h.txfmmode);
     214             :     }
     215             : 
     216      996688 :     if (s->s.h.keyframe || s->s.h.intraonly) {
     217      218046 :         uint8_t *a = &s->above_mode_ctx[col * 2];
     218      218046 :         uint8_t *l = &td->left_mode_ctx[(row7) << 1];
     219             : 
     220      218046 :         b->comp = 0;
     221      218046 :         if (b->bs > BS_8x8) {
     222             :             // FIXME the memory storage intermediates here aren't really
     223             :             // necessary, they're just there to make the code slightly
     224             :             // simpler for now
     225       78372 :             b->mode[0] =
     226       78372 :             a[0]       = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
     227       78372 :                                           ff_vp9_default_kf_ymode_probs[a[0]][l[0]]);
     228       78372 :             if (b->bs != BS_8x4) {
     229       56065 :                 b->mode[1] = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
     230       56065 :                                               ff_vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
     231       56065 :                 l[0]       =
     232       56065 :                 a[1]       = b->mode[1];
     233             :             } else {
     234       22307 :                 l[0]       =
     235       44614 :                 a[1]       =
     236       44614 :                 b->mode[1] = b->mode[0];
     237             :             }
     238       78372 :             if (b->bs != BS_4x8) {
     239       63769 :                 b->mode[2] =
     240       63769 :                 a[0]       = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
     241       63769 :                                               ff_vp9_default_kf_ymode_probs[a[0]][l[1]]);
     242       63769 :                 if (b->bs != BS_8x4) {
     243       41462 :                     b->mode[3] = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
     244       41462 :                                                   ff_vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
     245       82924 :                     l[1]       =
     246       82924 :                     a[1]       = b->mode[3];
     247             :                 } else {
     248       44614 :                     l[1]       =
     249       66921 :                     a[1]       =
     250       44614 :                     b->mode[3] = b->mode[2];
     251             :                 }
     252             :             } else {
     253       14603 :                 b->mode[2] = b->mode[0];
     254       29206 :                 l[1]       =
     255       43809 :                 a[1]       =
     256       29206 :                 b->mode[3] = b->mode[1];
     257             :             }
     258             :         } else {
     259      139674 :             b->mode[0] = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
     260      139674 :                                           ff_vp9_default_kf_ymode_probs[*a][*l]);
     261      139674 :             b->mode[3] =
     262      139674 :             b->mode[2] =
     263      139674 :             b->mode[1] = b->mode[0];
     264             :             // FIXME this can probably be optimized
     265      139674 :             memset(a, b->mode[0], ff_vp9_bwh_tab[0][b->bs][0]);
     266      139674 :             memset(l, b->mode[0], ff_vp9_bwh_tab[0][b->bs][1]);
     267             :         }
     268      218046 :         b->uvmode = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
     269      218046 :                                      ff_vp9_default_kf_uvmode_probs[b->mode[3]]);
     270      560596 :     } else if (b->intra) {
     271       32118 :         b->comp = 0;
     272       32118 :         if (b->bs > BS_8x8) {
     273        1662 :             b->mode[0] = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
     274        1662 :                                           s->prob.p.y_mode[0]);
     275        1662 :             td->counts.y_mode[0][b->mode[0]]++;
     276        1662 :             if (b->bs != BS_8x4) {
     277        1186 :                 b->mode[1] = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
     278        1186 :                                               s->prob.p.y_mode[0]);
     279        1186 :                 td->counts.y_mode[0][b->mode[1]]++;
     280             :             } else {
     281         476 :                 b->mode[1] = b->mode[0];
     282             :             }
     283        1662 :             if (b->bs != BS_4x8) {
     284        1156 :                 b->mode[2] = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
     285        1156 :                                               s->prob.p.y_mode[0]);
     286        1156 :                 td->counts.y_mode[0][b->mode[2]]++;
     287        1156 :                 if (b->bs != BS_8x4) {
     288         680 :                     b->mode[3] = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
     289         680 :                                                   s->prob.p.y_mode[0]);
     290         680 :                     td->counts.y_mode[0][b->mode[3]]++;
     291             :                 } else {
     292         476 :                     b->mode[3] = b->mode[2];
     293             :                 }
     294             :             } else {
     295         506 :                 b->mode[2] = b->mode[0];
     296         506 :                 b->mode[3] = b->mode[1];
     297             :             }
     298             :         } else {
     299             :             static const uint8_t size_group[10] = {
     300             :                 3, 3, 3, 3, 2, 2, 2, 1, 1, 1
     301             :             };
     302       30456 :             int sz = size_group[b->bs];
     303             : 
     304       30456 :             b->mode[0] = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
     305       30456 :                                           s->prob.p.y_mode[sz]);
     306       30456 :             b->mode[1] =
     307       30456 :             b->mode[2] =
     308       30456 :             b->mode[3] = b->mode[0];
     309       30456 :             td->counts.y_mode[sz][b->mode[3]]++;
     310             :         }
     311       32118 :         b->uvmode = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
     312       32118 :                                      s->prob.p.uv_mode[b->mode[3]]);
     313       32118 :         td->counts.uv_mode[b->mode[3]][b->uvmode]++;
     314             :     } else {
     315             :         static const uint8_t inter_mode_ctx_lut[14][14] = {
     316             :             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
     317             :             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
     318             :             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
     319             :             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
     320             :             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
     321             :             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
     322             :             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
     323             :             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
     324             :             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
     325             :             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
     326             :             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
     327             :             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
     328             :             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
     329             :             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
     330             :         };
     331             : 
     332      528478 :         if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
     333             :             av_assert2(s->s.h.segmentation.feat[b->seg_id].ref_val != 0);
     334         801 :             b->comp = 0;
     335         801 :             b->ref[0] = s->s.h.segmentation.feat[b->seg_id].ref_val - 1;
     336             :         } else {
     337             :             // read comp_pred flag
     338      527677 :             if (s->s.h.comppredmode != PRED_SWITCHABLE) {
     339      343445 :                 b->comp = s->s.h.comppredmode == PRED_COMPREF;
     340             :             } else {
     341             :                 int c;
     342             : 
     343             :                 // FIXME add intra as ref=0xff (or -1) to make these easier?
     344      184232 :                 if (have_a) {
     345      179448 :                     if (have_l) {
     346      176369 :                         if (s->above_comp_ctx[col] && td->left_comp_ctx[row7]) {
     347       18517 :                             c = 4;
     348      157852 :                         } else if (s->above_comp_ctx[col]) {
     349       19316 :                             c = 2 + (td->left_intra_ctx[row7] ||
     350        9604 :                                      td->left_ref_ctx[row7] == s->s.h.fixcompref);
     351      148140 :                         } else if (td->left_comp_ctx[row7]) {
     352       16690 :                             c = 2 + (s->above_intra_ctx[col] ||
     353        8289 :                                      s->above_ref_ctx[col] == s->s.h.fixcompref);
     354             :                         } else {
     355      418349 :                             c = (!s->above_intra_ctx[col] &&
     356      138871 :                                  s->above_ref_ctx[col] == s->s.h.fixcompref) ^
     357      277654 :                                 (!td->left_intra_ctx[row7] &&
     358      137915 :                                  td->left_ref_ctx[row & 7] == s->s.h.fixcompref);
     359             :                         }
     360             :                     } else {
     361        5872 :                         c = s->above_comp_ctx[col] ? 3 :
     362        2793 :                         (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->s.h.fixcompref);
     363             :                     }
     364        4784 :                 } else if (have_l) {
     365        8314 :                     c = td->left_comp_ctx[row7] ? 3 :
     366        3840 :                     (!td->left_intra_ctx[row7] && td->left_ref_ctx[row7] == s->s.h.fixcompref);
     367             :                 } else {
     368         310 :                     c = 1;
     369             :                 }
     370      184232 :                 b->comp = vp56_rac_get_prob(td->c, s->prob.p.comp[c]);
     371      184232 :                 td->counts.comp[c][b->comp]++;
     372             :             }
     373             : 
     374             :             // read actual references
     375             :             // FIXME probably cache a few variables here to prevent repetitive
     376             :             // memory accesses below
     377      527677 :             if (b->comp) { /* two references */
     378       27071 :                 int fix_idx = s->s.h.signbias[s->s.h.fixcompref], var_idx = !fix_idx, c, bit;
     379             : 
     380       27071 :                 b->ref[fix_idx] = s->s.h.fixcompref;
     381             :                 // FIXME can this codeblob be replaced by some sort of LUT?
     382       27071 :                 if (have_a) {
     383       26388 :                     if (have_l) {
     384       26087 :                         if (s->above_intra_ctx[col]) {
     385          68 :                             if (td->left_intra_ctx[row7]) {
     386          14 :                                 c = 2;
     387             :                             } else {
     388          54 :                                 c = 1 + 2 * (td->left_ref_ctx[row7] != s->s.h.varcompref[1]);
     389             :                             }
     390       26019 :                         } else if (td->left_intra_ctx[row7]) {
     391          84 :                             c = 1 + 2 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
     392             :                         } else {
     393       25935 :                             int refl = td->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
     394             : 
     395       25935 :                             if (refl == refa && refa == s->s.h.varcompref[1]) {
     396        2446 :                                 c = 0;
     397       23489 :                             } else if (!td->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
     398        6014 :                                 if ((refa == s->s.h.fixcompref && refl == s->s.h.varcompref[0]) ||
     399        2500 :                                     (refl == s->s.h.fixcompref && refa == s->s.h.varcompref[0])) {
     400         741 :                                     c = 4;
     401             :                                 } else {
     402        1378 :                                     c = (refa == refl) ? 3 : 1;
     403             :                                 }
     404       21370 :                             } else if (!td->left_comp_ctx[row7]) {
     405        2700 :                                 if (refa == s->s.h.varcompref[1] && refl != s->s.h.varcompref[1]) {
     406         443 :                                     c = 1;
     407             :                                 } else {
     408        4618 :                                     c = (refl == s->s.h.varcompref[1] &&
     409        2361 :                                          refa != s->s.h.varcompref[1]) ? 2 : 4;
     410             :                                 }
     411       18670 :                             } else if (!s->above_comp_ctx[col]) {
     412        3194 :                                 if (refl == s->s.h.varcompref[1] && refa != s->s.h.varcompref[1]) {
     413         442 :                                     c = 1;
     414             :                                 } else {
     415        5689 :                                     c = (refa == s->s.h.varcompref[1] &&
     416        2937 :                                          refl != s->s.h.varcompref[1]) ? 2 : 4;
     417             :                                 }
     418             :                             } else {
     419       15476 :                                 c = (refl == refa) ? 4 : 2;
     420             :                             }
     421             :                         }
     422             :                     } else {
     423         301 :                         if (s->above_intra_ctx[col]) {
     424           1 :                             c = 2;
     425         300 :                         } else if (s->above_comp_ctx[col]) {
     426         130 :                             c = 4 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
     427             :                         } else {
     428         170 :                             c = 3 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
     429             :                         }
     430             :                     }
     431         683 :                 } else if (have_l) {
     432         665 :                     if (td->left_intra_ctx[row7]) {
     433           4 :                         c = 2;
     434         661 :                     } else if (td->left_comp_ctx[row7]) {
     435         397 :                         c = 4 * (td->left_ref_ctx[row7] != s->s.h.varcompref[1]);
     436             :                     } else {
     437         264 :                         c = 3 * (td->left_ref_ctx[row7] != s->s.h.varcompref[1]);
     438             :                     }
     439             :                 } else {
     440          18 :                     c = 2;
     441             :                 }
     442       27071 :                 bit = vp56_rac_get_prob(td->c, s->prob.p.comp_ref[c]);
     443       27071 :                 b->ref[var_idx] = s->s.h.varcompref[bit];
     444       27071 :                 td->counts.comp_ref[c][bit]++;
     445             :             } else /* single reference */ {
     446             :                 int bit, c;
     447             : 
     448      500606 :                 if (have_a && !s->above_intra_ctx[col]) {
     449      927304 :                     if (have_l && !td->left_intra_ctx[row7]) {
     450      877172 :                         if (td->left_comp_ctx[row7]) {
     451        6151 :                             if (s->above_comp_ctx[col]) {
     452        2088 :                                 c = 1 + (!s->s.h.fixcompref || !td->left_ref_ctx[row7] ||
     453         529 :                                          !s->above_ref_ctx[col]);
     454             :                             } else {
     455        9184 :                                 c = (3 * !s->above_ref_ctx[col]) +
     456        4592 :                                     (!s->s.h.fixcompref || !td->left_ref_ctx[row7]);
     457             :                             }
     458      432435 :                         } else if (s->above_comp_ctx[col]) {
     459       13284 :                             c = (3 * !td->left_ref_ctx[row7]) +
     460        6642 :                                 (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
     461             :                         } else {
     462      425793 :                             c = 2 * !td->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
     463             :                         }
     464       25066 :                     } else if (s->above_intra_ctx[col]) {
     465           0 :                         c = 2;
     466       25066 :                     } else if (s->above_comp_ctx[col]) {
     467         229 :                         c = 1 + (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
     468             :                     } else {
     469       24837 :                         c = 4 * (!s->above_ref_ctx[col]);
     470             :                     }
     471       36954 :                 } else if (have_l && !td->left_intra_ctx[row7]) {
     472       63872 :                     if (td->left_intra_ctx[row7]) {
     473           0 :                         c = 2;
     474       31936 :                     } else if (td->left_comp_ctx[row7]) {
     475         327 :                         c = 1 + (!s->s.h.fixcompref || !td->left_ref_ctx[row7]);
     476             :                     } else {
     477       31609 :                         c = 4 * (!td->left_ref_ctx[row7]);
     478             :                     }
     479             :                 } else {
     480        5018 :                     c = 2;
     481             :                 }
     482      500606 :                 bit = vp56_rac_get_prob(td->c, s->prob.p.single_ref[c][0]);
     483      500606 :                 td->counts.single_ref[c][0][bit]++;
     484      500606 :                 if (!bit) {
     485      400344 :                     b->ref[0] = 0;
     486             :                 } else {
     487             :                     // FIXME can this codeblob be replaced by some sort of LUT?
     488      100262 :                     if (have_a) {
     489       95197 :                         if (have_l) {
     490       92065 :                             if (td->left_intra_ctx[row7]) {
     491        3386 :                                 if (s->above_intra_ctx[col]) {
     492         772 :                                     c = 2;
     493        2614 :                                 } else if (s->above_comp_ctx[col]) {
     494          98 :                                     c = 1 + 2 * (s->s.h.fixcompref == 1 ||
     495          49 :                                                  s->above_ref_ctx[col] == 1);
     496        2565 :                                 } else if (!s->above_ref_ctx[col]) {
     497         704 :                                     c = 3;
     498             :                                 } else {
     499        1861 :                                     c = 4 * (s->above_ref_ctx[col] == 1);
     500             :                                 }
     501       88679 :                             } else if (s->above_intra_ctx[col]) {
     502        1920 :                                 if (td->left_intra_ctx[row7]) {
     503           0 :                                     c = 2;
     504        1920 :                                 } else if (td->left_comp_ctx[row7]) {
     505          90 :                                     c = 1 + 2 * (s->s.h.fixcompref == 1 ||
     506          45 :                                                  td->left_ref_ctx[row7] == 1);
     507        1875 :                                 } else if (!td->left_ref_ctx[row7]) {
     508         556 :                                     c = 3;
     509             :                                 } else {
     510        1319 :                                     c = 4 * (td->left_ref_ctx[row7] == 1);
     511             :                                 }
     512       86759 :                             } else if (s->above_comp_ctx[col]) {
     513        4360 :                                 if (td->left_comp_ctx[row7]) {
     514         845 :                                     if (td->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
     515        1448 :                                         c = 3 * (s->s.h.fixcompref == 1 ||
     516         724 :                                                  td->left_ref_ctx[row7] == 1);
     517             :                                     } else {
     518         121 :                                         c = 2;
     519             :                                     }
     520        3515 :                                 } else if (!td->left_ref_ctx[row7]) {
     521         658 :                                     c = 1 + 2 * (s->s.h.fixcompref == 1 ||
     522         329 :                                                  s->above_ref_ctx[col] == 1);
     523             :                                 } else {
     524        6372 :                                     c = 3 * (td->left_ref_ctx[row7] == 1) +
     525        3186 :                                     (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
     526             :                                 }
     527       82399 :                             } else if (td->left_comp_ctx[row7]) {
     528        1733 :                                 if (!s->above_ref_ctx[col]) {
     529         660 :                                     c = 1 + 2 * (s->s.h.fixcompref == 1 ||
     530         330 :                                                  td->left_ref_ctx[row7] == 1);
     531             :                                 } else {
     532        2806 :                                     c = 3 * (s->above_ref_ctx[col] == 1) +
     533        1403 :                                     (s->s.h.fixcompref == 1 || td->left_ref_ctx[row7] == 1);
     534             :                                 }
     535       80666 :                             } else if (!s->above_ref_ctx[col]) {
     536       18174 :                                 if (!td->left_ref_ctx[row7]) {
     537        7482 :                                     c = 3;
     538             :                                 } else {
     539       10692 :                                     c = 4 * (td->left_ref_ctx[row7] == 1);
     540             :                                 }
     541       62492 :                             } else if (!td->left_ref_ctx[row7]) {
     542        9216 :                                 c = 4 * (s->above_ref_ctx[col] == 1);
     543             :                             } else {
     544      106552 :                                 c = 2 * (td->left_ref_ctx[row7] == 1) +
     545       53276 :                                     2 * (s->above_ref_ctx[col] == 1);
     546             :                             }
     547             :                         } else {
     548        6195 :                             if (s->above_intra_ctx[col] ||
     549        6021 :                                 (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
     550         765 :                                 c = 2;
     551        2367 :                             } else if (s->above_comp_ctx[col]) {
     552         105 :                                 c = 3 * (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
     553             :                             } else {
     554        2262 :                                 c = 4 * (s->above_ref_ctx[col] == 1);
     555             :                             }
     556             :                         }
     557        5065 :                     } else if (have_l) {
     558        9160 :                         if (td->left_intra_ctx[row7] ||
     559        8891 :                             (!td->left_comp_ctx[row7] && !td->left_ref_ctx[row7])) {
     560         979 :                             c = 2;
     561        3679 :                         } else if (td->left_comp_ctx[row7]) {
     562         113 :                             c = 3 * (s->s.h.fixcompref == 1 || td->left_ref_ctx[row7] == 1);
     563             :                         } else {
     564        3566 :                             c = 4 * (td->left_ref_ctx[row7] == 1);
     565             :                         }
     566             :                     } else {
     567         407 :                         c = 2;
     568             :                     }
     569      100262 :                     bit = vp56_rac_get_prob(td->c, s->prob.p.single_ref[c][1]);
     570      100262 :                     td->counts.single_ref[c][1][bit]++;
     571      100262 :                     b->ref[0] = 1 + bit;
     572             :                 }
     573             :             }
     574             :         }
     575             : 
     576      528478 :         if (b->bs <= BS_8x8) {
     577      478633 :             if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].skip_enabled) {
     578         801 :                 b->mode[0] =
     579         801 :                 b->mode[1] =
     580         801 :                 b->mode[2] =
     581         801 :                 b->mode[3] = ZEROMV;
     582             :             } else {
     583             :                 static const uint8_t off[10] = {
     584             :                     3, 0, 0, 1, 0, 0, 0, 0, 0, 0
     585             :                 };
     586             : 
     587             :                 // FIXME this needs to use the LUT tables from find_ref_mvs
     588             :                 // because not all are -1,0/0,-1
     589      955664 :                 int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
     590      477832 :                                           [td->left_mode_ctx[row7 + off[b->bs]]];
     591             : 
     592      477832 :                 b->mode[0] = vp8_rac_get_tree(td->c, ff_vp9_inter_mode_tree,
     593      477832 :                                               s->prob.p.mv_mode[c]);
     594      477832 :                 b->mode[1] =
     595      477832 :                 b->mode[2] =
     596      477832 :                 b->mode[3] = b->mode[0];
     597      477832 :                 td->counts.mv_mode[c][b->mode[0] - 10]++;
     598             :             }
     599             :         }
     600             : 
     601      528478 :         if (s->s.h.filtermode == FILTER_SWITCHABLE) {
     602             :             int c;
     603             : 
     604      427141 :             if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
     605      787216 :                 if (have_l && td->left_mode_ctx[row7] >= NEARESTMV) {
     606      742652 :                     c = s->above_filter_ctx[col] == td->left_filter_ctx[row7] ?
     607      371326 :                         td->left_filter_ctx[row7] : 3;
     608             :                 } else {
     609       22282 :                     c = s->above_filter_ctx[col];
     610             :                 }
     611       33533 :             } else if (have_l && td->left_mode_ctx[row7] >= NEARESTMV) {
     612       29136 :                 c = td->left_filter_ctx[row7];
     613             :             } else {
     614        4397 :                 c = 3;
     615             :             }
     616             : 
     617      427141 :             filter_id = vp8_rac_get_tree(td->c, ff_vp9_filter_tree,
     618      427141 :                                          s->prob.p.filter[c]);
     619      427141 :             td->counts.filter[c][filter_id]++;
     620      427141 :             b->filter = ff_vp9_filter_lut[filter_id];
     621             :         } else {
     622      101337 :             b->filter = s->s.h.filtermode;
     623             :         }
     624             : 
     625      528478 :         if (b->bs > BS_8x8) {
     626       49845 :             int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][td->left_mode_ctx[row7]];
     627             : 
     628       49845 :             b->mode[0] = vp8_rac_get_tree(td->c, ff_vp9_inter_mode_tree,
     629       49845 :                                           s->prob.p.mv_mode[c]);
     630       49845 :             td->counts.mv_mode[c][b->mode[0] - 10]++;
     631       49845 :             ff_vp9_fill_mv(td, b->mv[0], b->mode[0], 0);
     632             : 
     633       49845 :             if (b->bs != BS_8x4) {
     634       32560 :                 b->mode[1] = vp8_rac_get_tree(td->c, ff_vp9_inter_mode_tree,
     635       32560 :                                               s->prob.p.mv_mode[c]);
     636       32560 :                 td->counts.mv_mode[c][b->mode[1] - 10]++;
     637       32560 :                 ff_vp9_fill_mv(td, b->mv[1], b->mode[1], 1);
     638             :             } else {
     639       17285 :                 b->mode[1] = b->mode[0];
     640       17285 :                 AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
     641       17285 :                 AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
     642             :             }
     643             : 
     644       49845 :             if (b->bs != BS_4x8) {
     645       34131 :                 b->mode[2] = vp8_rac_get_tree(td->c, ff_vp9_inter_mode_tree,
     646       34131 :                                               s->prob.p.mv_mode[c]);
     647       34131 :                 td->counts.mv_mode[c][b->mode[2] - 10]++;
     648       34131 :                 ff_vp9_fill_mv(td, b->mv[2], b->mode[2], 2);
     649             : 
     650       34131 :                 if (b->bs != BS_8x4) {
     651       16846 :                     b->mode[3] = vp8_rac_get_tree(td->c, ff_vp9_inter_mode_tree,
     652       16846 :                                                   s->prob.p.mv_mode[c]);
     653       16846 :                     td->counts.mv_mode[c][b->mode[3] - 10]++;
     654       16846 :                     ff_vp9_fill_mv(td, b->mv[3], b->mode[3], 3);
     655             :                 } else {
     656       17285 :                     b->mode[3] = b->mode[2];
     657       17285 :                     AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
     658       17285 :                     AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
     659             :                 }
     660             :             } else {
     661       15714 :                 b->mode[2] = b->mode[0];
     662       15714 :                 AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
     663       15714 :                 AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
     664       15714 :                 b->mode[3] = b->mode[1];
     665       15714 :                 AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
     666       15714 :                 AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
     667             :             }
     668             :         } else {
     669      478633 :             ff_vp9_fill_mv(td, b->mv[0], b->mode[0], -1);
     670      478633 :             AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
     671      478633 :             AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
     672      478633 :             AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
     673      478633 :             AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
     674      478633 :             AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
     675      478633 :             AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
     676             :         }
     677             : 
     678      528478 :         vref = b->ref[b->comp ? s->s.h.signbias[s->s.h.varcompref[0]] : 0];
     679             :     }
     680             : 
     681             : #if HAVE_FAST_64BIT
     682             : #define SPLAT_CTX(var, val, n) \
     683             :     switch (n) { \
     684             :     case 1:  var = val;                                    break; \
     685             :     case 2:  AV_WN16A(&var, val *             0x0101);     break; \
     686             :     case 4:  AV_WN32A(&var, val *         0x01010101);     break; \
     687             :     case 8:  AV_WN64A(&var, val * 0x0101010101010101ULL);  break; \
     688             :     case 16: { \
     689             :         uint64_t v64 = val * 0x0101010101010101ULL; \
     690             :         AV_WN64A(              &var,     v64); \
     691             :         AV_WN64A(&((uint8_t *) &var)[8], v64); \
     692             :         break; \
     693             :     } \
     694             :     }
     695             : #else
     696             : #define SPLAT_CTX(var, val, n) \
     697             :     switch (n) { \
     698             :     case 1:  var = val;                         break; \
     699             :     case 2:  AV_WN16A(&var, val *     0x0101);  break; \
     700             :     case 4:  AV_WN32A(&var, val * 0x01010101);  break; \
     701             :     case 8: { \
     702             :         uint32_t v32 = val * 0x01010101; \
     703             :         AV_WN32A(              &var,     v32); \
     704             :         AV_WN32A(&((uint8_t *) &var)[4], v32); \
     705             :         break; \
     706             :     } \
     707             :     case 16: { \
     708             :         uint32_t v32 = val * 0x01010101; \
     709             :         AV_WN32A(              &var,      v32); \
     710             :         AV_WN32A(&((uint8_t *) &var)[4],  v32); \
     711             :         AV_WN32A(&((uint8_t *) &var)[8],  v32); \
     712             :         AV_WN32A(&((uint8_t *) &var)[12], v32); \
     713             :         break; \
     714             :     } \
     715             :     }
     716             : #endif
     717             : 
     718      778642 :     switch (ff_vp9_bwh_tab[1][b->bs][0]) {
     719             : #define SET_CTXS(perf, dir, off, n) \
     720             :     do { \
     721             :         SPLAT_CTX(perf->dir##_skip_ctx[off],      b->skip,          n); \
     722             :         SPLAT_CTX(perf->dir##_txfm_ctx[off],      b->tx,            n); \
     723             :         SPLAT_CTX(perf->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
     724             :         if (!s->s.h.keyframe && !s->s.h.intraonly) { \
     725             :             SPLAT_CTX(perf->dir##_intra_ctx[off], b->intra,   n); \
     726             :             SPLAT_CTX(perf->dir##_comp_ctx[off],  b->comp,    n); \
     727             :             SPLAT_CTX(perf->dir##_mode_ctx[off],  b->mode[3], n); \
     728             :             if (!b->intra) { \
     729             :                 SPLAT_CTX(perf->dir##_ref_ctx[off], vref, n); \
     730             :                 if (s->s.h.filtermode == FILTER_SWITCHABLE) { \
     731             :                     SPLAT_CTX(perf->dir##_filter_ctx[off], filter_id, n); \
     732             :                 } \
     733             :             } \
     734             :         } \
     735             :     } while (0)
     736      524290 :     case 1: SET_CTXS(s, above, col, 1); break;
     737      198921 :     case 2: SET_CTXS(s, above, col, 2); break;
     738       43028 :     case 4: SET_CTXS(s, above, col, 4); break;
     739       12403 :     case 8: SET_CTXS(s, above, col, 8); break;
     740             :     }
     741      778642 :     switch (ff_vp9_bwh_tab[1][b->bs][1]) {
     742      532978 :     case 1: SET_CTXS(td, left, row7, 1); break;
     743      192738 :     case 2: SET_CTXS(td, left, row7, 2); break;
     744       41396 :     case 4: SET_CTXS(td, left, row7, 4); break;
     745       11530 :     case 8: SET_CTXS(td, left, row7, 8); break;
     746             :     }
     747             : #undef SPLAT_CTX
     748             : #undef SET_CTXS
     749             : 
     750      778642 :     if (!s->s.h.keyframe && !s->s.h.intraonly) {
     751      560596 :         if (b->bs > BS_8x8) {
     752       51507 :             int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
     753             : 
     754       51507 :             AV_COPY32(&td->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
     755       51507 :             AV_COPY32(&td->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
     756       51507 :             AV_WN32A(&td->left_mv_ctx[row7 * 2 + 1][0], mv0);
     757       51507 :             AV_WN32A(&td->left_mv_ctx[row7 * 2 + 1][1], mv1);
     758       51507 :             AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
     759       51507 :             AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
     760       51507 :             AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
     761       51507 :             AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
     762             :         } else {
     763      509089 :             int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
     764             : 
     765     2210633 :             for (n = 0; n < w4 * 2; n++) {
     766     1701544 :                 AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
     767     1701544 :                 AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
     768             :             }
     769     2190525 :             for (n = 0; n < h4 * 2; n++) {
     770     1681436 :                 AV_WN32A(&td->left_mv_ctx[row7 * 2 + n][0], mv0);
     771     1681436 :                 AV_WN32A(&td->left_mv_ctx[row7 * 2 + n][1], mv1);
     772             :             }
     773             :         }
     774             :     }
     775             : 
     776             :     // FIXME kinda ugly
     777     1952231 :     for (y = 0; y < h4; y++) {
     778     1173589 :         int x, o = (row + y) * s->sb_cols * 8 + col;
     779     1173589 :         VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[o];
     780             : 
     781     1173589 :         if (b->intra) {
     782      871872 :             for (x = 0; x < w4; x++) {
     783     1095590 :                 mv[x].ref[0] =
     784     1095590 :                 mv[x].ref[1] = -1;
     785             :             }
     786      849512 :         } else if (b->comp) {
     787      206417 :             for (x = 0; x < w4; x++) {
     788      153674 :                 mv[x].ref[0] = b->ref[0];
     789      153674 :                 mv[x].ref[1] = b->ref[1];
     790      153674 :                 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
     791      153674 :                 AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
     792             :             }
     793             :         } else {
     794     2737479 :             for (x = 0; x < w4; x++) {
     795     1940710 :                 mv[x].ref[0] = b->ref[0];
     796     1940710 :                 mv[x].ref[1] = -1;
     797     1940710 :                 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
     798             :             }
     799             :         }
     800             :     }
     801      778642 : }
     802             : 
     803             : /* FIXME merge cnt/eob arguments?
                     :  *
                     :  * Decode the coefficient tokens of one transform block from range coder c.
                     :  *
                     :  * Positions are visited in the order given by scan[]. The probabilities
                     :  * used for a position are tp = p[band][nnz]: band advances each time
                     :  * band_left (seeded from band_counts[]) reaches zero, and nnz is
                     :  * recomputed after every token from the token classes stored in cache[]
                     :  * at the two positions nb[i][0] and nb[i][1]. Every decision read also
                     :  * increments the matching eob[][][] / cnt[][][] counter.
                     :  *
                     :  * c               range coder to read from
                     :  * coef            output; written at index scan[i] as coef[rc] when
                     :  *                 is8bitsperpixel is set, otherwise through AV_WN32A at
                     :  *                 &coef[rc * 2] (see STORE_COEF below)
                     :  * n_coeffs        upper bound on the number of scan positions
                     :  * is_tx32x32      when non-zero the dequantized value is divided by 2
                     :  * is8bitsperpixel selects the store width and skips the extra high bits
                     :  *                 of the largest token category
                     :  * bpp             only consulted when !is8bitsperpixel; the value 12
                     :  *                 makes the largest category read two more bits
                     :  * cnt, eob        statistics counters, indexed [band][nnz][...]
                     :  * p               token probabilities, indexed [band][nnz][0..10]
                     :  * nnz             context used for the first position
                     :  * qmul            multiplier: qmul[0] for i == 0, qmul[1] for all later
                     :  *                 positions (presumably DC/AC - confirm against caller)
                     :  *
                     :  * Returns the scan index i at which decoding stopped: the position of
                     :  * the EOB, or n_coeffs if the loop ran to the end.
                     :  */
     804             : static av_always_inline int
     805     1937677 : decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
     806             :                         int is_tx32x32, int is8bitsperpixel, int bpp, unsigned (*cnt)[6][3],
     807             :                         unsigned (*eob)[6][2], uint8_t (*p)[6][11],
     808             :                         int nnz, const int16_t *scan, const int16_t (*nb)[2],
     809             :                         const int16_t *band_counts, int16_t *qmul)
     810             : {
     811     1937677 :     int i = 0, band = 0, band_left = band_counts[band];
     812     1937677 :     const uint8_t *tp = p[0][nnz];
     813             :     uint8_t cache[1024]; // token class (0..5) per coefficient position; source of nnz
     814             : 
     815             :     do {
     816             :         int val, rc;
     817             : 
     818    10347728 :         val = vp56_rac_get_prob_branchy(c, tp[0]); // eob: 0 ends the block
     819    10347728 :         eob[band][nnz][val]++;
     820    10347728 :         if (!val)
     821     1882638 :             break;
     822             : 
     823     8465090 : skip_eob: // re-entered after a zero token, which is not followed by an eob check
     824    16861947 :         if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
     825     8396857 :             cnt[band][nnz][0]++;
     826     8396857 :             if (!--band_left)
     827     1160580 :                 band_left = band_counts[++band];
     828     8396857 :             cache[scan[i]] = 0;
     829     8396857 :             nnz            = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
     830     8396857 :             tp             = p[band][nnz];
     831     8396857 :             if (++i == n_coeffs)
     832           0 :                 break;  //invalid input; blocks should end with EOB
     833     8396857 :             goto skip_eob;
     834             :         }
     835             : 
     836     8465090 :         rc = scan[i]; // output position of this non-zero coefficient
     837     8465090 :         if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
     838     4837879 :             cnt[band][nnz][1]++;
     839     4837879 :             val       = 1;
     840     4837879 :             cache[rc] = 1;
     841             :         } else {
     842     3627211 :             cnt[band][nnz][2]++; // every magnitude above 1 shares this counter
     843     3627211 :             if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
     844     2408217 :                 if (!vp56_rac_get_prob_branchy(c, tp[4])) {
     845     1376428 :                     cache[rc] = val = 2;
     846             :                 } else {
     847     1031789 :                     val       = 3 + vp56_rac_get_prob(c, tp[5]);
     848     1031789 :                     cache[rc] = 3; // class 3 covers both value 3 and value 4
     849             :                 }
     850     1218994 :             } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
     851      767440 :                 cache[rc] = 4;
     852      767440 :                 if (!vp56_rac_get_prob_branchy(c, tp[7])) {
     853      410210 :                     val  =  vp56_rac_get_prob(c, 159) + 5;
     854             :                 } else {
     855      357230 :                     val  = (vp56_rac_get_prob(c, 165) << 1) + 7;
     856      357230 :                     val +=  vp56_rac_get_prob(c, 145);
     857             :                 }
     858             :             } else { // cat 3-6
     859      451554 :                 cache[rc] = 5;
     860      451554 :                 if (!vp56_rac_get_prob_branchy(c, tp[8])) {
     861      376136 :                     if (!vp56_rac_get_prob_branchy(c, tp[9])) {
     862      243393 :                         val  = 11 + (vp56_rac_get_prob(c, 173) << 2);
     863      243393 :                         val +=      (vp56_rac_get_prob(c, 148) << 1);
     864      243393 :                         val +=       vp56_rac_get_prob(c, 140);
     865             :                     } else {
     866      132743 :                         val  = 19 + (vp56_rac_get_prob(c, 176) << 3);
     867      132743 :                         val +=      (vp56_rac_get_prob(c, 155) << 2);
     868      132743 :                         val +=      (vp56_rac_get_prob(c, 140) << 1);
     869      132743 :                         val +=       vp56_rac_get_prob(c, 135);
     870             :                     }
     871       75418 :                 } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
     872       55179 :                     val  = (vp56_rac_get_prob(c, 180) << 4) + 35;
     873       55179 :                     val += (vp56_rac_get_prob(c, 157) << 3);
     874       55179 :                     val += (vp56_rac_get_prob(c, 141) << 2);
     875       55179 :                     val += (vp56_rac_get_prob(c, 134) << 1);
     876       55179 :                     val +=  vp56_rac_get_prob(c, 130);
     877             :                 } else {
     878       20239 :                     val = 67; // base of the last category; the bits below are added on top
     879       20239 :                     if (!is8bitsperpixel) { // extra high bits exist only above 8 bits per pixel
     880          81 :                         if (bpp == 12) {
     881          60 :                             val += vp56_rac_get_prob(c, 255) << 17;
     882          60 :                             val += vp56_rac_get_prob(c, 255) << 16;
     883             :                         }
     884          81 :                         val +=  (vp56_rac_get_prob(c, 255) << 15);
     885          81 :                         val +=  (vp56_rac_get_prob(c, 255) << 14);
     886             :                     }
     887       20239 :                     val += (vp56_rac_get_prob(c, 254) << 13);
     888       20239 :                     val += (vp56_rac_get_prob(c, 254) << 12);
     889       20239 :                     val += (vp56_rac_get_prob(c, 254) << 11);
     890       20239 :                     val += (vp56_rac_get_prob(c, 252) << 10);
     891       20239 :                     val += (vp56_rac_get_prob(c, 249) << 9);
     892       20239 :                     val += (vp56_rac_get_prob(c, 243) << 8);
     893       20239 :                     val += (vp56_rac_get_prob(c, 230) << 7);
     894       20239 :                     val += (vp56_rac_get_prob(c, 196) << 6);
     895       20239 :                     val += (vp56_rac_get_prob(c, 177) << 5);
     896       20239 :                     val += (vp56_rac_get_prob(c, 153) << 4);
     897       20239 :                     val += (vp56_rac_get_prob(c, 140) << 3);
     898       20239 :                     val += (vp56_rac_get_prob(c, 133) << 2);
     899       20239 :                     val += (vp56_rac_get_prob(c, 130) << 1);
     900       20239 :                     val +=  vp56_rac_get_prob(c, 129);
     901             :                 }
     902             :             }
     903             :         }
     904             : #define STORE_COEF(c, i, v) do { \
     905             :     if (is8bitsperpixel) { \
     906             :         c[i] = v; \
     907             :     } else { \
     908             :         AV_WN32A(&c[i * 2], v); \
     909             :     } \
     910             : } while (0)
     911     8465090 :         if (!--band_left)
     912     2082975 :             band_left = band_counts[++band];
     913     8465090 :         if (is_tx32x32) // sign comes from vp8_rac_get(); product is formed in unsigned, then halved
     914     1346536 :             STORE_COEF(coef, rc, (int)((vp8_rac_get(c) ? -val : val) * (unsigned)qmul[!!i]) / 2);
     915             :         else
     916     7118554 :             STORE_COEF(coef, rc, (vp8_rac_get(c) ? -val : val) * (unsigned)qmul[!!i]);
     917     8465090 :         nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1; // context for the next position
     918     8465090 :         tp = p[band][nnz];
     919     8465090 :     } while (++i < n_coeffs);
     920             : 
     921     1937677 :     return i;
     922             : }
     923             : /* Out-of-line instance of decode_coeffs_b_generic() reading from td->c
                     :  * with is_tx32x32 = 0, is8bitsperpixel = 1 and bpp = 8. All other
                     :  * arguments are forwarded unchanged and the generic return value is
                     :  * passed back. */
     924     1827166 : static int decode_coeffs_b_8bpp(VP9TileData *td, int16_t *coef, int n_coeffs,
     925             :                                 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
     926             :                                 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
     927             :                                 const int16_t (*nb)[2], const int16_t *band_counts,
     928             :                                 int16_t *qmul)
     929             : {
     930     1827166 :     return decode_coeffs_b_generic(td->c, coef, n_coeffs, 0, 1, 8, cnt, eob, p,
     931             :                                    nnz, scan, nb, band_counts, qmul);
     932             : }
     933             : /* Out-of-line instance of decode_coeffs_b_generic() reading from td->c
                     :  * with is_tx32x32 = 1 (stored values are halved), is8bitsperpixel = 1
                     :  * and bpp = 8. All other arguments are forwarded unchanged and the
                     :  * generic return value is passed back. */
     934       12272 : static int decode_coeffs_b32_8bpp(VP9TileData *td, int16_t *coef, int n_coeffs,
     935             :                                   unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
     936             :                                   uint8_t (*p)[6][11], int nnz, const int16_t *scan,
     937             :                                   const int16_t (*nb)[2], const int16_t *band_counts,
     938             :                                   int16_t *qmul)
     939             : {
     940       12272 :     return decode_coeffs_b_generic(td->c, coef, n_coeffs, 1, 1, 8, cnt, eob, p,
     941             :                                    nnz, scan, nb, band_counts, qmul);
     942             : }
     943             : /* Out-of-line instance of decode_coeffs_b_generic() reading from td->c
                     :  * with is_tx32x32 = 0 and is8bitsperpixel = 0; bpp is taken from
                     :  * td->s->s.h.bpp. All other arguments are forwarded unchanged and the
                     :  * generic return value is passed back. */
     944       98124 : static int decode_coeffs_b_16bpp(VP9TileData *td, int16_t *coef, int n_coeffs,
     945             :                                  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
     946             :                                  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
     947             :                                  const int16_t (*nb)[2], const int16_t *band_counts,
     948             :                                  int16_t *qmul)
     949             : {
     950       98124 :     return decode_coeffs_b_generic(td->c, coef, n_coeffs, 0, 0, td->s->s.h.bpp, cnt, eob, p,
     951             :                                    nnz, scan, nb, band_counts, qmul);
     952             : }
     953             : /* Out-of-line instance of decode_coeffs_b_generic() reading from td->c
                     :  * with is_tx32x32 = 1 (stored values are halved) and
                     :  * is8bitsperpixel = 0; bpp is taken from td->s->s.h.bpp. All other
                     :  * arguments are forwarded unchanged and the generic return value is
                     :  * passed back. */
     954         115 : static int decode_coeffs_b32_16bpp(VP9TileData *td, int16_t *coef, int n_coeffs,
     955             :                                    unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
     956             :                                    uint8_t (*p)[6][11], int nnz, const int16_t *scan,
     957             :                                    const int16_t (*nb)[2], const int16_t *band_counts,
     958             :                                    int16_t *qmul)
     959             : {
     960         115 :     return decode_coeffs_b_generic(td->c, coef, n_coeffs, 1, 0, td->s->s.h.bpp, cnt, eob, p,
     961             :                                    nnz, scan, nb, band_counts, qmul);
     962             : }
     963             : 
     964      356165 : static av_always_inline int decode_coeffs(VP9TileData *td, int is8bitsperpixel)
     965             : {
                           /* Decode the luma and then the chroma coefficients of the current
                            * block (td->b). For every transform block this calls one of the
                            * decode_coeffs_b*_8bpp / decode_coeffs_b*_16bpp readers (chosen by
                            * is8bitsperpixel), stores the reader's return value in td->eob[] or
                            * td->uveob[][], and updates the above/left non-zero context bytes.
                            *
                            * Returns 1 if any reader call returned a non-zero value, 0 otherwise. */
     966      356165 :     VP9Context *s = td->s;
     967      356165 :     VP9Block *b = td->b;
     968      356165 :     int row = td->row, col = td->col;
                           /* probabilities and counters for the luma plane; re-pointed to the
                            * chroma tables further down */
     969      356165 :     uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
     970      356165 :     unsigned (*c)[6][3] = td->counts.coef[b->tx][0 /* y */][!b->intra];
     971      356165 :     unsigned (*e)[6][2] = td->counts.eob[b->tx][0 /* y */][!b->intra];
                           /* w4/h4: table block size doubled; end_x/end_y: the same, limited to
                            * what remains of the frame from (row, col).
                            * NOTE(review): presumably the table is in 8-pixel units and these
                            * are in 4-pixel units -- confirm against ff_vp9_bwh_tab. */
     972      356165 :     int w4 = ff_vp9_bwh_tab[1][b->bs][0] << 1, h4 = ff_vp9_bwh_tab[1][b->bs][1] << 1;
     973      356165 :     int end_x = FFMIN(2 * (s->cols - col), w4);
     974      356165 :     int end_y = FFMIN(2 * (s->rows - row), h4);
     975             :     int n, pl, x, y, ret;
                           /* qmul[0] is passed to the luma readers, qmul[1] to the chroma readers */
     976      356165 :     int16_t (*qmul)[2] = s->s.h.segmentation.feat[b->seg_id].qmul;
                           /* scan-table index: offset by 4 when the lossless flag is set */
     977      356165 :     int tx = 4 * s->s.h.lossless + b->tx;
     978      356165 :     const int16_t * const *yscans = ff_vp9_scans[tx];
     979      356165 :     const int16_t (* const * ynbs)[2] = ff_vp9_scans_nb[tx];
     980      356165 :     const int16_t *uvscan = ff_vp9_scans[b->uvtx][DCT_DCT];
     981      356165 :     const int16_t (*uvnb)[2] = ff_vp9_scans_nb[b->uvtx][DCT_DCT];
                           /* a/l: above and left non-zero context bytes for this block */
     982      356165 :     uint8_t *a = &s->above_y_nnz_ctx[col * 2];
     983      356165 :     uint8_t *l = &td->left_y_nnz_ctx[(row & 7) << 1];
                           /* One row per transform size (indexed by b->tx / b->uvtx). The first
                            * five entries sum to 13 (first row) or 21 (other rows); the sixth is
                            * written as "total coefficients - that sum". Entries 6 and 7 are not
                            * listed and therefore zero-initialized. */
     984             :     static const int16_t band_counts[4][8] = {
     985             :         { 1, 2, 3, 4,  3,   16 - 13 },
     986             :         { 1, 2, 3, 4, 11,   64 - 21 },
     987             :         { 1, 2, 3, 4, 11,  256 - 21 },
     988             :         { 1, 2, 3, 4, 11, 1024 - 21 },
     989             :     };
     990      356165 :     const int16_t *y_band_counts = band_counts[b->tx];
     991      356165 :     const int16_t *uv_band_counts = band_counts[b->uvtx];
                           /* multiplier applied to offsets into td->block / td->uvblock[] */
     992      356165 :     int bytesperpixel = is8bitsperpixel ? 1 : 2;
     993      356165 :     int total_coeff = 0;
     994             : 
                       /* MERGE: for every group of `step` context bytes below `end`, read the
                        * group with `rd` (AV_RN16A/AV_RN32A/AV_RN64A at the call sites) and store
                        * 1 in the group's first byte if the value read was non-zero, else 0.
                        * MERGE_CTX applies this to both the left (l) and above (a) contexts. */
     995             : #define MERGE(la, end, step, rd) \
     996             :     for (n = 0; n < end; n += step) \
     997             :         la[n] = !!rd(&la[n])
     998             : #define MERGE_CTX(step, rd) \
     999             :     do { \
    1000             :         MERGE(l, end_y, step, rd); \
    1001             :         MERGE(a, end_x, step, rd); \
    1002             :     } while (0)
    1003             : 
                       /* DECODE_Y_COEF_LOOP: visit the luma transform blocks row by row, `step`
                        * context entries apart. n advances by step * step per block and is used
                        * both for the coefficient offset (16 * n) and for the td->eob[] slot.
                        * The sum a[x] + l[y] is passed as the reader's nnz argument; afterwards
                        * both context bytes are set to whether the reader returned non-zero.
                        * mode_index picks the b->mode[] entry that selects the transform type;
                        * `v` is pasted into the reader name (empty, or 32 for the b32 readers).
                        * For step >= 4 the result is stored with a 16-bit write (AV_WN16A).
                        * NOTE(review): presumably because the value can exceed 255 for those
                        * transform sizes -- confirm against the type of td->eob. */
    1004             : #define DECODE_Y_COEF_LOOP(step, mode_index, v) \
    1005             :     for (n = 0, y = 0; y < end_y; y += step) { \
    1006             :         for (x = 0; x < end_x; x += step, n += step * step) { \
    1007             :             enum TxfmType txtp = ff_vp9_intra_txfm_type[b->mode[mode_index]]; \
    1008             :             ret = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
    1009             :                                     (td, td->block + 16 * n * bytesperpixel, 16 * step * step, \
    1010             :                                      c, e, p, a[x] + l[y], yscans[txtp], \
    1011             :                                      ynbs[txtp], y_band_counts, qmul[0]); \
    1012             :             a[x] = l[y] = !!ret; \
    1013             :             total_coeff |= !!ret; \
    1014             :             if (step >= 4) { \
    1015             :                 AV_WN16A(&td->eob[n], ret); \
    1016             :             } else { \
    1017             :                 td->eob[n] = ret; \
    1018             :             } \
    1019             :         } \
    1020             :     }
    1021             : 
                       /* SPLAT: counterpart of MERGE -- copy the first byte of every `step`-wide
                        * group to the other bytes of that group.
                        *   step == 2: la[n] = la[n - 1] for n = 1, 3, 5, ...
                        *   step == 4 / 8: when `cond` holds, one aligned 32-bit or 64-bit store
                        *                  (two 32-bit stores without HAVE_FAST_64BIT) of the byte
                        *                  replicated by multiplication; otherwise a memset()
                        *                  whose length is limited by FFMIN() to the bytes that
                        *                  remain before `end`.
                        * SPLAT_CTX passes "end == full size" as cond, i.e. the wide stores are
                        * used only when end_x / end_y was not reduced below w4 / h4. */
    1022             : #define SPLAT(la, end, step, cond) \
    1023             :     if (step == 2) { \
    1024             :         for (n = 1; n < end; n += step) \
    1025             :             la[n] = la[n - 1]; \
    1026             :     } else if (step == 4) { \
    1027             :         if (cond) { \
    1028             :             for (n = 0; n < end; n += step) \
    1029             :                 AV_WN32A(&la[n], la[n] * 0x01010101); \
    1030             :         } else { \
    1031             :             for (n = 0; n < end; n += step) \
    1032             :                 memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
    1033             :         } \
    1034             :     } else /* step == 8 */ { \
    1035             :         if (cond) { \
    1036             :             if (HAVE_FAST_64BIT) { \
    1037             :                 for (n = 0; n < end; n += step) \
    1038             :                     AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
    1039             :             } else { \
    1040             :                 for (n = 0; n < end; n += step) { \
    1041             :                     uint32_t v32 = la[n] * 0x01010101; \
    1042             :                     AV_WN32A(&la[n],     v32); \
    1043             :                     AV_WN32A(&la[n + 4], v32); \
    1044             :                 } \
    1045             :             } \
    1046             :         } else { \
    1047             :             for (n = 0; n < end; n += step) \
    1048             :                 memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
    1049             :         } \
    1050             :     }
    1051             : #define SPLAT_CTX(step) \
    1052             :     do { \
    1053             :         SPLAT(a, end_x, step, end_x == w4); \
    1054             :         SPLAT(l, end_y, step, end_y == h4); \
    1055             :     } while (0)
    1056             : 
    1057             :     /* y tokens */
                           /* For transforms larger than 4x4 the contexts are first merged to one
                            * flag per transform block, then decoded, then spread back out.
                            * In the 4x4 case the mode index is n (the running block index) when
                            * b->bs > BS_8x8 and 0 otherwise.
                            * NOTE(review): presumably block sizes after BS_8x8 in the enum are
                            * the sub-8x8 ones carrying one mode per 4x4 block -- confirm. */
    1058      356165 :     switch (b->tx) {
    1059      175788 :     case TX_4X4:
    1060      175788 :         DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
    1061      175788 :         break;
    1062      136248 :     case TX_8X8:
    1063      136248 :         MERGE_CTX(2, AV_RN16A);
    1064      136248 :         DECODE_Y_COEF_LOOP(2, 0,);
    1065      136248 :         SPLAT_CTX(2);
    1066      136248 :         break;
    1067       36298 :     case TX_16X16:
    1068       36298 :         MERGE_CTX(4, AV_RN32A);
    1069       36298 :         DECODE_Y_COEF_LOOP(4, 0,);
    1070       36298 :         SPLAT_CTX(4);
    1071       36298 :         break;
    1072        7831 :     case TX_32X32:
    1073        7831 :         MERGE_CTX(8, AV_RN64A);
    1074        7831 :         DECODE_Y_COEF_LOOP(8, 0, 32);
    1075        7831 :         SPLAT_CTX(8);
    1076        7831 :         break;
    1077             :     }
    1078             : 
                       /* DECODE_UV_COEF_LOOP: same walk as DECODE_Y_COEF_LOOP, but for chroma
                        * plane `pl`: coefficients go to td->uvblock[pl], results to
                        * td->uveob[pl], the scan is always uvscan/uvnb (looked up with DCT_DCT
                        * above) and the dequantizer argument is qmul[1]. */
    1079             : #define DECODE_UV_COEF_LOOP(step, v) \
    1080             :     for (n = 0, y = 0; y < end_y; y += step) { \
    1081             :         for (x = 0; x < end_x; x += step, n += step * step) { \
    1082             :             ret = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
    1083             :                                     (td, td->uvblock[pl] + 16 * n * bytesperpixel, \
    1084             :                                      16 * step * step, c, e, p, a[x] + l[y], \
    1085             :                                      uvscan, uvnb, uv_band_counts, qmul[1]); \
    1086             :             a[x] = l[y] = !!ret; \
    1087             :             total_coeff |= !!ret; \
    1088             :             if (step >= 4) { \
    1089             :                 AV_WN16A(&td->uveob[pl][n], ret); \
    1090             :             } else { \
    1091             :                 td->uveob[pl][n] = ret; \
    1092             :             } \
    1093             :         } \
    1094             :     }
    1095             : 
                           /* switch to the chroma probability/counter tables and scale the block
                            * extents by the chroma subsampling shifts */
    1096      356165 :     p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
    1097      356165 :     c = td->counts.coef[b->uvtx][1 /* uv */][!b->intra];
    1098      356165 :     e = td->counts.eob[b->uvtx][1 /* uv */][!b->intra];
    1099      356165 :     w4 >>= s->ss_h;
    1100      356165 :     end_x >>= s->ss_h;
    1101      356165 :     h4 >>= s->ss_v;
    1102      356165 :     end_y >>= s->ss_v;
    1103     1068495 :     for (pl = 0; pl < 2; pl++) {
                               /* the context index is doubled in a direction that is not
                                * subsampled (shift by !ss_h / !ss_v) */
    1104      712330 :         a = &s->above_uv_nnz_ctx[pl][col << !s->ss_h];
    1105      712330 :         l = &td->left_uv_nnz_ctx[pl][(row & 7) << !s->ss_v];
    1106      712330 :         switch (b->uvtx) {
    1107      595192 :         case TX_4X4:
    1108      595192 :             DECODE_UV_COEF_LOOP(1,);
    1109      595192 :             break;
    1110       98916 :         case TX_8X8:
    1111       98916 :             MERGE_CTX(2, AV_RN16A);
    1112       98916 :             DECODE_UV_COEF_LOOP(2,);
    1113       98916 :             SPLAT_CTX(2);
    1114       98916 :             break;
    1115       16474 :         case TX_16X16:
    1116       16474 :             MERGE_CTX(4, AV_RN32A);
    1117       16474 :             DECODE_UV_COEF_LOOP(4,);
    1118       16474 :             SPLAT_CTX(4);
    1119       16474 :             break;
    1120        1748 :         case TX_32X32:
    1121        1748 :             MERGE_CTX(8, AV_RN64A);
    1122        1748 :             DECODE_UV_COEF_LOOP(8, 32);
    1123        1748 :             SPLAT_CTX(8);
    1124        1748 :             break;
    1125             :         }
    1126             :     }
    1127             : 
    1128      356165 :     return total_coeff;
    1129             : }
    1130             : 
    1131      343320 : static int decode_coeffs_8bpp(VP9TileData *td)
    1132             : {
                           /* Non-inline entry point for decode_coeffs() with the constant
                            * is8bitsperpixel = 1; returns its result unchanged. */
    1133      343320 :     return decode_coeffs(td, 1);
    1134             : }
    1135             : 
    1136       12845 : static int decode_coeffs_16bpp(VP9TileData *td)
    1137             : {
                           /* Non-inline entry point for decode_coeffs() with the constant
                            * is8bitsperpixel = 0; returns its result unchanged. */
    1138       12845 :     return decode_coeffs(td, 0);
    1139             : }
    1140             : 
    1141     1489209 : static av_always_inline void mask_edges(uint8_t (*mask)[8][4], int ss_h, int ss_v,
    1142             :                                         int row_and_7, int col_and_7,
    1143             :                                         int w, int h, int col_end, int row_end,
    1144             :                                         enum TxfmMode tx, int skip_inter)
    1145             : {
                           // OR the loop-filter edge bits of one block into mask[dir][row][kind].
                           //
                           // Each mask byte holds one bit per column: bit col_and_7 is the
                           // block's first column and w consecutive bits cover its width
                           // (m_col below); rows row_and_7 .. row_and_7 + h - 1 cover its height.
                           //
                           // NOTE(review): from the skip_inter branch (only bit t per row in
                           // mask[0], only the first row in mask[1]) mask[0] appears to hold
                           // the left/vertical edges and mask[1] the top/horizontal edges; the
                           // last index appears to select the filter width (0: 16, 1: 8, 2: 4,
                           // judging by the m_row_16/m_row_8/m_row_4 names) with 3 presumably
                           // the inner 4-px edges -- confirm against the loop filter code.
                           //
                           // ss_h, ss_v       : subsampling shifts of the plane (0 or 1)
                           // col_end, row_end : non-zero values passed by the caller when the
                           //                    frame dimension is odd and the block reaches
                           //                    the frame edge, 0 otherwise
                           // tx               : transform size used by the block in this plane
                           // skip_inter       : non-zero for a skipped inter block; then only
                           //                    the block's outer edges are marked
    1146             :     static const unsigned wide_filter_col_mask[2] = { 0x11, 0x01 };
    1147             :     static const unsigned wide_filter_row_mask[2] = { 0x03, 0x07 };
    1148             : 
    1149             :     // FIXME I'm pretty sure all loops can be replaced by a single LUT if
    1150             :     // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
    1151             :     // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
    1152             :     // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
    1153             : 
    1154             :     // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
    1155             :     // edges. This means that for UV, we work on two subsampled blocks at
    1156             :     // a time, and we only use the topleft block's mode information to set
    1157             :     // things like block strength. Thus, for any block size smaller than
    1158             :     // 16x16, ignore the odd portion of the block.
    1159     1489209 :     if (tx == TX_4X4 && (ss_v | ss_h)) {
                               // blocks at odd positions are dropped entirely (return); blocks
                               // at even positions are extended by one row/column unless
                               // row_end/col_end is set
    1160      546273 :         if (h == ss_v) {
    1161      500728 :             if (row_and_7 & 1)
    1162      248286 :                 return;
    1163      252442 :             if (!row_end)
    1164      248285 :                 h += 1;
    1165             :         }
    1166      297987 :         if (w == ss_h) {
    1167      267043 :             if (col_and_7 & 1)
    1168      128425 :                 return;
    1169      138618 :             if (!col_end)
    1170      128425 :                 w += 1;
    1171             :         }
    1172             :     }
    1173             : 
    1174     1372552 :     if (tx == TX_4X4 && !skip_inter) {
                               // t: bit of the first column; m_col: w consecutive column bits
    1175      260054 :         int t = 1 << col_and_7, m_col = (t << w) - t, y;
    1176             :         // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
    1177      260054 :         int m_row_8 = m_col & wide_filter_col_mask[ss_h], m_row_4 = m_col - m_row_8;
    1178             : 
    1179      620285 :         for (y = row_and_7; y < h + row_and_7; y++) {
                                   // index 1 when (y & wide_filter_row_mask) is zero, else 2
    1180      360231 :             int col_mask_id = 2 - !(y & wide_filter_row_mask[ss_v]);
    1181             : 
    1182      360231 :             mask[0][y][1] |= m_row_8;
    1183      360231 :             mask[0][y][2] |= m_row_4;
    1184             :             // for odd lines, if the odd col is not being filtered,
    1185             :             // skip odd row also:
    1186             :             // .---. <-- a
    1187             :             // |   |
    1188             :             // |___| <-- b
    1189             :             // ^   ^
    1190             :             // c   d
    1191             :             //
    1192             :             // if a/c are even row/col and b/d are odd, and d is skipped,
    1193             :             // e.g. right edge of size-66x66.webm, then skip b also (bug)
    1194      360231 :             if ((ss_h & ss_v) && (col_end & 1) && (y & 1)) {
    1195        4224 :                 mask[1][y][col_mask_id] |= (t << (w - 1)) - t;
    1196             :             } else {
    1197      356007 :                 mask[1][y][col_mask_id] |= m_col;
    1198             :             }
                                   // index 3 is only written for a direction that is not
                                   // subsampled
    1199      360231 :             if (!ss_h)
    1200      185095 :                 mask[0][y][3] |= m_col;
    1201      360231 :             if (!ss_v) {
    1202      179000 :                 if (ss_h && (col_end & 1))
    1203           0 :                     mask[1][y][3] |= (t << (w - 1)) - t;
    1204             :                 else
    1205      179000 :                     mask[1][y][3] |= m_col;
    1206             :             }
    1207             :         }
    1208             :     } else {
    1209      852444 :         int y, t = 1 << col_and_7, m_col = (t << w) - t;
    1210             : 
    1211      852444 :         if (!skip_inter) {
                                   // transforms larger than 4x4 with coded residual: mark an
                                   // edge every (1 << l2) columns / rows inside the block
    1212      250923 :             int mask_id = (tx == TX_8X8);
    1213      250923 :             int l2 = tx + ss_h - 1, step1d;
    1214             :             static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
    1215      250923 :             int m_row = m_col & masks[l2];
    1216             : 
    1217             :             // at odd UV col/row edges tx16/tx32 loopfilter edges, force
    1218             :             // 8wd loopfilter to prevent going off the visible edge.
    1219      250931 :             if (ss_h && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
    1220           8 :                 int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
    1221           8 :                 int m_row_8 = m_row - m_row_16;
    1222             : 
    1223          60 :                 for (y = row_and_7; y < h + row_and_7; y++) {
    1224          52 :                     mask[0][y][0] |= m_row_16;
    1225          52 :                     mask[0][y][1] |= m_row_8;
    1226             :                 }
    1227             :             } else {
    1228      676972 :                 for (y = row_and_7; y < h + row_and_7; y++)
    1229      426057 :                     mask[0][y][mask_id] |= m_row;
    1230             :             }
    1231             : 
                                   // same again for the other direction, stepping over rows
    1232      250923 :             l2 = tx + ss_v - 1;
    1233      250923 :             step1d = 1 << l2;
    1234      250923 :             if (ss_v && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
    1235         152 :                 for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
    1236          75 :                     mask[1][y][0] |= m_col;
                                       // if the loop stopped exactly on the block's last row,
                                       // mark that row under index 1 instead of index 0
    1237         154 :                 if (y - row_and_7 == h - 1)
    1238           2 :                     mask[1][y][1] |= m_col;
    1239             :             } else {
    1240      532228 :                 for (y = row_and_7; y < h + row_and_7; y += step1d)
    1241      281382 :                     mask[1][y][mask_id] |= m_col;
    1242             :             }
    1243      601521 :         } else if (tx != TX_4X4) {
                                   // skipped inter block, transform larger than 4x4: only the
                                   // first row (mask[1]) and the first column bit (mask[0])
    1244             :             int mask_id;
    1245             : 
    1246      511872 :             mask_id = (tx == TX_8X8) || (h == ss_v);
    1247      511872 :             mask[1][row_and_7][mask_id] |= m_col;
    1248      511872 :             mask_id = (tx == TX_8X8) || (w == ss_h);
    1249     1538145 :             for (y = row_and_7; y < h + row_and_7; y++)
    1250     1026273 :                 mask[0][y][mask_id] |= t;
    1251             :         } else {
                                   // skipped inter block with 4x4 transform: outer edges only,
                                   // split between index 1 and 2 by the wide_filter masks
    1252       89649 :             int t8 = t & wide_filter_col_mask[ss_h], t4 = t - t8;
    1253             : 
    1254      254693 :             for (y = row_and_7; y < h + row_and_7; y++) {
    1255      165044 :                 mask[0][y][2] |= t4;
    1256      165044 :                 mask[0][y][1] |= t8;
    1257             :             }
    1258       89649 :             mask[1][row_and_7][2 - !(row_and_7 & wide_filter_row_mask[ss_v])] |= m_col;
    1259             :         }
    1260             :     }
    1261             : }
    1262             : 
    1263      778642 : void ff_vp9_decode_block(VP9TileData *td, int row, int col,
    1264             :                          VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
    1265             :                          enum BlockLevel bl, enum BlockPartition bp)
    1266             : {
                           // Process one block at position (row, col):
                           //  - when s->pass < 2: decode its mode and coefficients (or clear the
                           //    non-zero contexts for a skipped block); in pass 1 the per-block
                           //    buffers of s->td[0] are then advanced and the function returns
                           //    before any reconstruction;
                           //  - otherwise reconstruct it (intra or inter, 8 bpp or higher) into
                           //    the current frame, through temporary buffers when the block
                           //    overhangs the frame, and record loop-filter levels and edge
                           //    masks in lflvl;
                           //  - in pass 2 the per-block buffers of s->td[0] are advanced at the
                           //    end.
                           // yoff/uvoff are offsets into f->data[0] and f->data[1]/f->data[2];
                           // bl and bp together give the block size (bs = bl * 3 + bp).
    1267      778642 :     VP9Context *s = td->s;
    1268      778642 :     VP9Block *b = td->b;
    1269      778642 :     enum BlockSize bs = bl * 3 + bp;
    1270      778642 :     int bytesperpixel = s->bytesperpixel;
    1271      778642 :     int w4 = ff_vp9_bwh_tab[1][bs][0], h4 = ff_vp9_bwh_tab[1][bs][1], lvl;
    1272             :     int emu[2];
    1273      778642 :     AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
    1274             : 
    1275      778642 :     td->row = row;
    1276      778642 :     td->row7 = row & 7;
    1277      778642 :     td->col = col;
    1278      778642 :     td->col7 = col & 7;
    1279             : 
                           // motion vector limits derived from the block position and size
                           // NOTE(review): presumably in 1/8-pel units (64 per row/col unit
                           // plus a 128 margin) -- confirm against the users of min_mv/max_mv
    1280      778642 :     td->min_mv.x = -(128 + col * 64);
    1281      778642 :     td->min_mv.y = -(128 + row * 64);
    1282      778642 :     td->max_mv.x = 128 + (s->cols - col - w4) * 64;
    1283      778642 :     td->max_mv.y = 128 + (s->rows - row - h4) * 64;
    1284             : 
    1285      778642 :     if (s->pass < 2) {
    1286      778642 :         b->bs = bs;
    1287      778642 :         b->bl = bl;
    1288      778642 :         b->bp = bp;
    1289      778642 :         decode_mode(td);
                               // the chroma transform is one size below the luma transform when
                               // (1 << b->tx) equals twice the block width/height in a
                               // subsampled direction
    1290     1609437 :         b->uvtx = b->tx - ((s->ss_h && w4 * 2 == (1 << b->tx)) ||
    1291      552668 :                            (s->ss_v && h4 * 2 == (1 << b->tx)));
    1292             : 
    1293      778642 :         if (!b->skip) {
    1294             :             int has_coeffs;
    1295             : 
    1296      356165 :             if (bytesperpixel == 1) {
    1297      343320 :                 has_coeffs = decode_coeffs_8bpp(td);
    1298             :             } else {
    1299       12845 :                 has_coeffs = decode_coeffs_16bpp(td);
    1300             :             }
                                   // no coefficients were decoded for an inter block with
                                   // b->bs <= BS_8x8: mark it as skipped after the fact and set
                                   // the above/left skip contexts to 1
    1301      356165 :             if (!has_coeffs && b->bs <= BS_8x8 && !b->intra) {
    1302           0 :                 b->skip = 1;
    1303           0 :                 memset(&s->above_skip_ctx[col], 1, w4);
    1304           0 :                 memset(&td->left_skip_ctx[td->row7], 1, h4);
    1305             :             }
    1306             :         } else {
    1307      422477 :             int row7 = td->row7;
    1308             : 
                       // SPLAT_ZERO_CTX: zero n consecutive context bytes starting at v with a
                       // single store of matching width (n = 1, 2, 4, 8 or 16).
                       // SPLAT_ZERO_YUV: zero the luma context (n * 2 bytes at off * 2) and both
                       // chroma contexts; a chroma context covers n bytes at off when the plane
                       // is subsampled in direction dir2, and n * 2 bytes at off * 2 otherwise.
    1309             : #define SPLAT_ZERO_CTX(v, n) \
    1310             :     switch (n) { \
    1311             :     case 1:  v = 0;          break; \
    1312             :     case 2:  AV_ZERO16(&v);  break; \
    1313             :     case 4:  AV_ZERO32(&v);  break; \
    1314             :     case 8:  AV_ZERO64(&v);  break; \
    1315             :     case 16: AV_ZERO128(&v); break; \
    1316             :     }
    1317             : #define SPLAT_ZERO_YUV(dir, var, off, n, dir2) \
    1318             :     do { \
    1319             :         SPLAT_ZERO_CTX(dir##_y_##var[off * 2], n * 2); \
    1320             :         if (s->ss_##dir2) { \
    1321             :             SPLAT_ZERO_CTX(dir##_uv_##var[0][off], n); \
    1322             :             SPLAT_ZERO_CTX(dir##_uv_##var[1][off], n); \
    1323             :         } else { \
    1324             :             SPLAT_ZERO_CTX(dir##_uv_##var[0][off * 2], n * 2); \
    1325             :             SPLAT_ZERO_CTX(dir##_uv_##var[1][off * 2], n * 2); \
    1326             :         } \
    1327             :     } while (0)
    1328             : 
                                   // skipped block: clear the above and left non-zero contexts
                                   // it covers; the macro argument must be a constant, hence
                                   // one case per block width/height
    1329      422477 :             switch (w4) {
    1330      246028 :             case 1: SPLAT_ZERO_YUV(s->above, nnz_ctx, col, 1, h); break;
    1331      132861 :             case 2: SPLAT_ZERO_YUV(s->above, nnz_ctx, col, 2, h); break;
    1332       32689 :             case 4: SPLAT_ZERO_YUV(s->above, nnz_ctx, col, 4, h); break;
    1333       10899 :             case 8: SPLAT_ZERO_YUV(s->above, nnz_ctx, col, 8, h); break;
    1334             :             }
    1335      422477 :             switch (h4) {
    1336      251202 :             case 1: SPLAT_ZERO_YUV(td->left, nnz_ctx, row7, 1, v); break;
    1337      129624 :             case 2: SPLAT_ZERO_YUV(td->left, nnz_ctx, row7, 2, v); break;
    1338       31454 :             case 4: SPLAT_ZERO_YUV(td->left, nnz_ctx, row7, 4, v); break;
    1339       10197 :             case 8: SPLAT_ZERO_YUV(td->left, nnz_ctx, row7, 8, v); break;
    1340             :             }
    1341             :         }
    1342             : 
    1343      778642 :         if (s->pass == 1) {
                                   // pass 1 stops after parsing: step the block, coefficient
                                   // and eob pointers of s->td[0] past this block and return
                                   // without reconstructing
    1344           0 :             s->td[0].b++;
    1345           0 :             s->td[0].block += w4 * h4 * 64 * bytesperpixel;
    1346           0 :             s->td[0].uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
    1347           0 :             s->td[0].uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
    1348           0 :             s->td[0].eob += 4 * w4 * h4;
    1349           0 :             s->td[0].uveob[0] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
    1350           0 :             s->td[0].uveob[1] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
    1351             : 
    1352           0 :             return;
    1353             :         }
    1354             :     }
    1355             : 
    1356             :     // emulated overhangs if the stride of the target buffer can't hold. This
    1357             :     // makes it possible to support emu-edge and so on even if we have large block
    1358             :     // overhangs
                           // emu[0] / emu[1]: luma / chroma must be reconstructed into the
                           // temporary buffers (stride 128) because the block extends past the
                           // line size or past s->rows
    1359     1557284 :     emu[0] = (col + w4) * 8 * bytesperpixel > f->linesize[0] ||
    1360      778642 :              (row + h4) > s->rows;
    1361     1557284 :     emu[1] = ((col + w4) * 8 >> s->ss_h) * bytesperpixel > f->linesize[1] ||
    1362      778642 :              (row + h4) > s->rows;
    1363      778642 :     if (emu[0]) {
    1364        2220 :         td->dst[0] = td->tmp_y;
    1365        2220 :         td->y_stride = 128;
    1366             :     } else {
    1367      776422 :         td->dst[0] = f->data[0] + yoff;
    1368      776422 :         td->y_stride = f->linesize[0];
    1369             :     }
    1370      778642 :     if (emu[1]) {
    1371        2220 :         td->dst[1] = td->tmp_uv[0];
    1372        2220 :         td->dst[2] = td->tmp_uv[1];
    1373        2220 :         td->uv_stride = 128;
    1374             :     } else {
    1375      776422 :         td->dst[1] = f->data[1] + uvoff;
    1376      776422 :         td->dst[2] = f->data[2] + uvoff;
    1377      776422 :         td->uv_stride = f->linesize[1];
    1378             :     }
                           // reconstruct: intra or inter, using the 16bpp variants whenever the
                           // header bit depth is above 8
    1379      778642 :     if (b->intra) {
    1380      250164 :         if (s->s.h.bpp > 8) {
    1381        4210 :             ff_vp9_intra_recon_16bpp(td, yoff, uvoff);
    1382             :         } else {
    1383      245954 :             ff_vp9_intra_recon_8bpp(td, yoff, uvoff);
    1384             :         }
    1385             :     } else {
    1386      528478 :         if (s->s.h.bpp > 8) {
    1387       14448 :             ff_vp9_inter_recon_16bpp(td);
    1388             :         } else {
    1389      514030 :             ff_vp9_inter_recon_8bpp(td);
    1390             :         }
    1391             :     }
    1392      778642 :     if (emu[0]) {
                               // copy the visible w x h part of the temporary luma buffer into
                               // the frame. w is split into power-of-two widths: for each n,
                               // bw = 64 >> n, and when that bit is set in w a bw-wide column
                               // is copied with s->dsp.mc[n][0][0][0][0] at pixel offset o.
    1393        2220 :         int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;
    1394             : 
    1395        6416 :         for (n = 0; o < w; n++) {
    1396        4196 :             int bw = 64 >> n;
    1397             : 
    1398             :             av_assert2(n <= 4);
    1399        4196 :             if (w & bw) {
    1400        4470 :                 s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o * bytesperpixel, f->linesize[0],
    1401        2235 :                                          td->tmp_y + o * bytesperpixel, 128, h, 0, 0);
    1402        2235 :                 o += bw;
    1403             :             }
    1404             :         }
    1405             :     }
    1406      778642 :     if (emu[1]) {
                               // same copy for both chroma planes; sizes are shifted by the
                               // subsampling and the width search starts at n = s->ss_h
    1407        2220 :         int w = FFMIN(s->cols - col, w4) * 8 >> s->ss_h;
    1408        2220 :         int h = FFMIN(s->rows - row, h4) * 8 >> s->ss_v, n, o = 0;
    1409             : 
    1410        6416 :         for (n = s->ss_h; o < w; n++) {
    1411        4196 :             int bw = 64 >> n;
    1412             : 
    1413             :             av_assert2(n <= 4);
    1414        4196 :             if (w & bw) {
    1415        4470 :                 s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o * bytesperpixel, f->linesize[1],
    1416        2235 :                                          td->tmp_uv[0] + o * bytesperpixel, 128, h, 0, 0);
    1417        4470 :                 s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o * bytesperpixel, f->linesize[2],
    1418        2235 :                                          td->tmp_uv[1] + o * bytesperpixel, 128, h, 0, 0);
    1419        2235 :                 o += bw;
    1420             :             }
    1421             :         }
    1422             :     }
    1423             : 
    1424             :     // pick filter level and find edges to apply filter to
                           // the level is looked up per segment, indexed by reference (0 for
                           // intra, b->ref[0] + 1 otherwise) and by whether b->mode[3] differs
                           // from ZEROMV; nothing is recorded when filtering is off or the
                           // level is not positive
    1425     1527866 :     if (s->s.h.filter.level &&
    1426     1498448 :         (lvl = s->s.h.segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
    1427      749224 :                                                       [b->mode[3] != ZEROMV]) > 0) {
    1428      749133 :         int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
    1429      749133 :         int skip_inter = !b->intra && b->skip, col7 = td->col7, row7 = td->row7;
    1430             : 
                               // NOTE(review): setctx_2d presumably fills a w4 x h4 rectangle
                               // of the 8-wide level array with lvl -- confirm
    1431      749133 :         setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
                               // luma edges: no subsampling, no frame-edge hints, luma tx
    1432      749133 :         mask_edges(lflvl->mask[0], 0, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
                               // chroma edges get their own mask only when subsampled; the
                               // col_end/row_end arguments are s->cols & 7 / s->rows & 7 when
                               // that dimension is odd and the block reaches the frame edge
    1433      749133 :         if (s->ss_h || s->ss_v)
    1434     2244100 :             mask_edges(lflvl->mask[1], s->ss_h, s->ss_v, row7, col7, x_end, y_end,
    1435     1112979 :                        s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
    1436      916951 :                        s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
    1437             :                        b->uvtx, skip_inter);
    1438             :     }
    1439             : 
    1440      778642 :     if (s->pass == 2) {
                               // pass 2: step the block, coefficient and eob pointers of
                               // s->td[0] past this block (same amounts as in pass 1)
    1441           0 :         s->td[0].b++;
    1442           0 :         s->td[0].block += w4 * h4 * 64 * bytesperpixel;
    1443           0 :         s->td[0].uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
    1444           0 :         s->td[0].uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
    1445           0 :         s->td[0].eob += 4 * w4 * h4;
    1446           0 :         s->td[0].uveob[0] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
    1447           0 :         s->td[0].uveob[1] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
    1448             :     }
    1449             : }

Generated by: LCOV version 1.13