| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /* | ||
| 2 | * Copyright (c) 2010 Aurelien Jacobs <aurel@gnuage.org> | ||
| 3 | * Copyright (c) 2017 Clément Bœsch <u@pkh.me> | ||
| 4 | * | ||
| 5 | * This file is part of FFmpeg. | ||
| 6 | * | ||
| 7 | * FFmpeg is free software; you can redistribute it and/or | ||
| 8 | * modify it under the terms of the GNU Lesser General Public | ||
| 9 | * License as published by the Free Software Foundation; either | ||
| 10 | * version 2.1 of the License, or (at your option) any later version. | ||
| 11 | * | ||
| 12 | * FFmpeg is distributed in the hope that it will be useful, | ||
| 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 15 | * Lesser General Public License for more details. | ||
| 16 | * | ||
| 17 | * You should have received a copy of the GNU Lesser General Public | ||
| 18 | * License along with FFmpeg; if not, write to the Free Software | ||
| 19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 20 | */ | ||
| 21 | |||
| 22 | #include "libavutil/avassert.h" | ||
| 23 | #include "libavutil/avstring.h" | ||
| 24 | #include "libavutil/common.h" | ||
| 25 | #include "libavutil/parseutils.h" | ||
| 26 | #include "htmlsubtitles.h" | ||
| 27 | #include <ctype.h> | ||
| 28 | |||
/*
 * Parse the value of an HTML color attribute.
 *
 * A run of leading '#' characters is collapsed into a single one, then the
 * text up to the first '"', ' ' or '>' (or the end of the string) is handed
 * to av_parse_color().
 *
 * @param log_ctx  logging context forwarded to av_parse_color()
 * @param str      start of the attribute value
 * @return rgba[0] in bits 0-7, rgba[1] in bits 8-15 and rgba[2] in bits
 *         16-23 on success, -1 if av_parse_color() reports an error
 */
static int html_color_parse(void *log_ctx, const char *str)
{
    uint8_t rgba[4];
    const char *color = str;
    size_t color_len;

    /* "##ff0000" -> "#ff0000": drop every '#' that is followed by another */
    while (color[0] == '#' && color[1] == '#')
        color++;

    color_len = strcspn(color, "\" >");
    if (av_parse_color(rgba, color, color_len, log_ctx) < 0)
        return -1;

    return rgba[0] | rgba[1] << 8 | rgba[2] << 16;
}
| 40 | |||
| 41 | 934 | static void rstrip_spaces_buf(AVBPrint *buf) | |
| 42 | { | ||
| 43 |
1/2✓ Branch 1 taken 934 times.
✗ Branch 2 not taken.
|
934 | if (av_bprint_is_complete(buf)) |
| 44 |
4/4✓ Branch 0 taken 1085 times.
✓ Branch 1 taken 1 times.
✓ Branch 2 taken 152 times.
✓ Branch 3 taken 933 times.
|
1086 | while (buf->len > 0 && buf->str[buf->len - 1] == ' ') |
| 45 | 152 | buf->str[--buf->len] = 0; | |
| 46 | 934 | } | |
| 47 | |||
/*
 * Fast check for an ASS alignment override at the start of a string.
 *
 * Returns 1 when the input starts with "{\an", followed by exactly one
 * ASCII decimal digit, followed by '}'; returns 0 otherwise. This stands in
 * for a much slower sscanf(in, "{\\an%*1u}%n", &len) based test.
 */
static int scanbraces(const char* in) {
    static const char prefix[] = "{\\an";
    const size_t prefix_len = sizeof(prefix) - 1;

    if (strncmp(in, prefix, prefix_len))
        return 0;

    /*
     * The prefix matched, so in[0..3] are non-NUL and in[4] may be read.
     * in[5] is only read when in[4] is a digit, i.e. also non-NUL.
     */
    return in[prefix_len] >= '0' && in[prefix_len] <= '9' &&
           in[prefix_len + 1] == '}';
}
| 66 | |||
| 67 | /* skip all {\xxx} substrings except for {\an%d} | ||
| 68 | and all microdvd like styles such as {Y:xxx} */ | ||
| 69 | 128 | static void handle_open_brace(AVBPrint *dst, const char **inp, int *an, int *closing_brace_missing) | |
| 70 | { | ||
| 71 | 128 | const char *in = *inp; | |
| 72 | |||
| 73 | 128 | *an += scanbraces(in); | |
| 74 | |||
| 75 |
1/2✓ Branch 0 taken 128 times.
✗ Branch 1 not taken.
|
128 | if (!*closing_brace_missing) { |
| 76 |
4/4✓ Branch 0 taken 32 times.
✓ Branch 1 taken 96 times.
✓ Branch 2 taken 16 times.
✓ Branch 3 taken 16 times.
|
128 | if ( (*an != 1 && in[1] == '\\') |
| 77 |
4/6✓ Branch 0 taken 112 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 104 times.
✓ Branch 4 taken 8 times.
✗ Branch 5 not taken.
|
112 | || (in[1] && strchr("CcFfoPSsYy", in[1]) && in[2] == ':')) { |
| 78 | 24 | char *bracep = strchr(in+2, '}'); | |
| 79 |
1/2✓ Branch 0 taken 24 times.
✗ Branch 1 not taken.
|
24 | if (bracep) { |
| 80 | 24 | *inp = bracep; | |
| 81 | 24 | return; | |
| 82 | } else | ||
| 83 | ✗ | *closing_brace_missing = 1; | |
| 84 | } | ||
| 85 | } | ||
| 86 | |||
| 87 | 104 | av_bprint_chars(dst, *in, 1); | |
| 88 | } | ||
| 89 | |||
/* Font attributes in effect at one nesting level of <font> tags. */
struct font_tag {
    char face[128];     /* font face name; an empty string is treated as "not set" */
    int size;           /* font size; 0 is treated as "not set" */
    uint32_t color;     /* low 24 bits: color value; a non-zero top byte marks it as set */
};
| 95 | |||
/*
 * Fast scanner for the remainder of a tag (everything after '<' or "</").
 *
 * Copies characters from in to buffer until a '>' is met. On success the
 * copied text is NUL-terminated, *lenp receives the number of input
 * characters consumed including the '>', and 1 is returned.
 *
 * Returns 0 (leaving *lenp untouched) when the end of the string or a '<'
 * comes first, or when no '>' shows up within the first 128 characters.
 * buffer must have room for 128 bytes; it may hold a partial,
 * unterminated copy after a failure.
 */
static int scantag(const char* in, char* buffer, int* lenp) {
    int pos = 0;

    while (pos < 128) {
        const char ch = in[pos];

        if (ch == '\0' || ch == '<')
            return 0;

        if (ch == '>') {
            buffer[pos] = '\0';
            *lenp = pos + 1;
            return 1;
        }

        buffer[pos++] = ch;
    }

    return 0;
}
| 123 | |||
| 124 | /* | ||
| 125 | * The general politic of the convert is to mask unsupported tags or formatting | ||
| 126 | * errors (but still alert the user/subtitles writer with an error/warning) | ||
| 127 | * without dropping any actual text content for the final user. | ||
| 128 | */ | ||
| 129 | 411 | int ff_htmlmarkup_to_ass(void *log_ctx, AVBPrint *dst, const char *in) | |
| 130 | { | ||
| 131 | char *param, buffer[128]; | ||
| 132 | 411 | int len, tag_close, sptr = 0, line_start = 1, an = 0, end = 0; | |
| 133 | 411 | int closing_brace_missing = 0; | |
| 134 | int i, likely_a_tag; | ||
| 135 | |||
| 136 | /* | ||
| 137 | * state stack is only present for fonts since they are the only tags where | ||
| 138 | * the state is not binary. Here is a typical use case: | ||
| 139 | * | ||
| 140 | * <font color="red" size=10> | ||
| 141 | * red 10 | ||
| 142 | * <font size=50> RED AND BIG </font> | ||
| 143 | * red 10 again | ||
| 144 | * </font> | ||
| 145 | * | ||
| 146 | * On the other hand, using the state system for all the tags should be | ||
| 147 | * avoided because it breaks wrongly nested tags such as: | ||
| 148 | * | ||
| 149 | * <b> foo <i> bar </b> bla </i> | ||
| 150 | * | ||
| 151 | * We don't want to break here; instead, we will treat all these tags as | ||
| 152 | * binary state markers. Basically, "<b>" will activate bold, and "</b>" | ||
| 153 | * will deactivate it, whatever the current state. | ||
| 154 | * | ||
| 155 | * This will also prevents cases where we have a random closing tag | ||
| 156 | * remaining after the opening one was dropped. Yes, this happens and we | ||
| 157 | * still don't want to print a "</b>" at the end of the dialog event. | ||
| 158 | */ | ||
| 159 | struct font_tag stack[16]; | ||
| 160 | |||
| 161 | 411 | memset(&stack[0], 0, sizeof(stack[0])); | |
| 162 | |||
| 163 |
3/4✓ Branch 0 taken 34008 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 33597 times.
✓ Branch 3 taken 411 times.
|
34008 | for (; !end && *in; in++) { |
| 164 |
5/6✗ Branch 0 not taken.
✓ Branch 1 taken 523 times.
✓ Branch 2 taken 4783 times.
✓ Branch 3 taken 128 times.
✓ Branch 4 taken 762 times.
✓ Branch 5 taken 27401 times.
|
33597 | switch (*in) { |
| 165 | ✗ | case '\r': | |
| 166 | ✗ | break; | |
| 167 | 523 | case '\n': | |
| 168 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 523 times.
|
523 | if (line_start) { |
| 169 | ✗ | end = 1; | |
| 170 | ✗ | break; | |
| 171 | } | ||
| 172 | 523 | rstrip_spaces_buf(dst); | |
| 173 | 523 | av_bprintf(dst, "\\N"); | |
| 174 | 523 | line_start = 1; | |
| 175 | 523 | break; | |
| 176 | 4783 | case ' ': | |
| 177 |
2/2✓ Branch 0 taken 4703 times.
✓ Branch 1 taken 80 times.
|
4783 | if (!line_start) |
| 178 | 4703 | av_bprint_chars(dst, *in, 1); | |
| 179 | 4783 | break; | |
| 180 | 128 | case '{': | |
| 181 | 128 | handle_open_brace(dst, &in, &an, &closing_brace_missing); | |
| 182 | 128 | break; | |
| 183 | 762 | case '<': | |
| 184 | /* | ||
| 185 | * "<<" are likely latin guillemets in ASCII or some kind of random | ||
| 186 | * style effect; see sub/badsyntax.srt in the FATE samples | ||
| 187 | * directory for real test cases. | ||
| 188 | */ | ||
| 189 | |||
| 190 | 762 | likely_a_tag = 1; | |
| 191 |
2/2✓ Branch 0 taken 27 times.
✓ Branch 1 taken 762 times.
|
789 | for (i = 0; in[1] == '<'; i++) { |
| 192 | 27 | av_bprint_chars(dst, '<', 1); | |
| 193 | 27 | likely_a_tag = 0; | |
| 194 | 27 | in++; | |
| 195 | } | ||
| 196 | |||
| 197 | 762 | tag_close = in[1] == '/'; | |
| 198 |
2/2✓ Branch 0 taken 352 times.
✓ Branch 1 taken 410 times.
|
762 | if (tag_close) |
| 199 | 352 | likely_a_tag = 1; | |
| 200 | |||
| 201 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 762 times.
|
762 | av_assert0(in[0] == '<'); |
| 202 | |||
| 203 | 762 | len = 0; | |
| 204 | |||
| 205 |
3/4✓ Branch 1 taken 738 times.
✓ Branch 2 taken 24 times.
✓ Branch 3 taken 738 times.
✗ Branch 4 not taken.
|
1500 | if (scantag(in+tag_close+1, buffer, &len) && len > 0) { |
| 206 | 738 | const int skip = len + tag_close; | |
| 207 | 738 | const char *tagname = buffer; | |
| 208 |
2/2✓ Branch 0 taken 5 times.
✓ Branch 1 taken 738 times.
|
743 | while (*tagname == ' ') { |
| 209 | 5 | likely_a_tag = 0; | |
| 210 | 5 | tagname++; | |
| 211 | } | ||
| 212 |
2/2✓ Branch 0 taken 178 times.
✓ Branch 1 taken 560 times.
|
738 | if ((param = strchr(tagname, ' '))) |
| 213 | 178 | *param++ = 0; | |
| 214 | |||
| 215 | /* Check if this is likely a tag */ | ||
| 216 | #define LIKELY_A_TAG_CHAR(x) (((x) >= '0' && (x) <= '9') || \ | ||
| 217 | ((x) >= 'a' && (x) <= 'z') || \ | ||
| 218 | ((x) >= 'A' && (x) <= 'Z') || \ | ||
| 219 | (x) == '_' || (x) == '/') | ||
| 220 |
2/2✓ Branch 0 taken 2730 times.
✓ Branch 1 taken 727 times.
|
3457 | for (i = 0; tagname[i]; i++) { |
| 221 |
15/16✓ Branch 0 taken 2718 times.
✓ Branch 1 taken 12 times.
✓ Branch 2 taken 2691 times.
✓ Branch 3 taken 27 times.
✓ Branch 4 taken 2591 times.
✓ Branch 5 taken 112 times.
✗ Branch 6 not taken.
✓ Branch 7 taken 2591 times.
✓ Branch 8 taken 99 times.
✓ Branch 9 taken 13 times.
✓ Branch 10 taken 88 times.
✓ Branch 11 taken 11 times.
✓ Branch 12 taken 13 times.
✓ Branch 13 taken 88 times.
✓ Branch 14 taken 11 times.
✓ Branch 15 taken 2 times.
|
2730 | if (!LIKELY_A_TAG_CHAR(tagname[i])) { |
| 222 | 11 | likely_a_tag = 0; | |
| 223 | 11 | break; | |
| 224 | } | ||
| 225 | } | ||
| 226 | |||
| 227 |
2/2✓ Branch 1 taken 310 times.
✓ Branch 2 taken 428 times.
|
738 | if (!av_strcasecmp(tagname, "font")) { |
| 228 |
3/4✓ Branch 0 taken 155 times.
✓ Branch 1 taken 155 times.
✓ Branch 2 taken 155 times.
✗ Branch 3 not taken.
|
465 | if (tag_close && sptr > 0) { |
| 229 | 155 | struct font_tag *cur_tag = &stack[sptr--]; | |
| 230 | 155 | struct font_tag *last_tag = &stack[sptr]; | |
| 231 | |||
| 232 |
2/2✓ Branch 0 taken 93 times.
✓ Branch 1 taken 62 times.
|
155 | if (cur_tag->size) { |
| 233 |
2/2✓ Branch 0 taken 77 times.
✓ Branch 1 taken 16 times.
|
93 | if (!last_tag->size) |
| 234 | 77 | av_bprintf(dst, "{\\fs}"); | |
| 235 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 8 times.
|
16 | else if (last_tag->size != cur_tag->size) |
| 236 | 8 | av_bprintf(dst, "{\\fs%d}", last_tag->size); | |
| 237 | } | ||
| 238 | |||
| 239 |
2/2✓ Branch 0 taken 121 times.
✓ Branch 1 taken 34 times.
|
155 | if (cur_tag->color & 0xff000000) { |
| 240 |
2/2✓ Branch 0 taken 88 times.
✓ Branch 1 taken 33 times.
|
121 | if (!(last_tag->color & 0xff000000)) |
| 241 | 88 | av_bprintf(dst, "{\\c}"); | |
| 242 |
2/2✓ Branch 0 taken 9 times.
✓ Branch 1 taken 24 times.
|
33 | else if (last_tag->color != cur_tag->color) |
| 243 | 9 | av_bprintf(dst, "{\\c&H%"PRIX32"&}", last_tag->color & 0xffffff); | |
| 244 | } | ||
| 245 | |||
| 246 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 147 times.
|
155 | if (cur_tag->face[0]) { |
| 247 |
1/2✓ Branch 0 taken 8 times.
✗ Branch 1 not taken.
|
8 | if (!last_tag->face[0]) |
| 248 | 8 | av_bprintf(dst, "{\\fn}"); | |
| 249 | ✗ | else if (strcmp(last_tag->face, cur_tag->face)) | |
| 250 | ✗ | av_bprintf(dst, "{\\fn%s}", last_tag->face); | |
| 251 | } | ||
| 252 |
2/4✓ Branch 0 taken 155 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 155 times.
✗ Branch 3 not taken.
|
155 | } else if (!tag_close && sptr < FF_ARRAY_ELEMS(stack) - 1) { |
| 253 | 155 | struct font_tag *new_tag = &stack[sptr + 1]; | |
| 254 | |||
| 255 | 155 | *new_tag = stack[sptr++]; | |
| 256 | |||
| 257 |
2/2✓ Branch 0 taken 191 times.
✓ Branch 1 taken 155 times.
|
346 | while (param) { |
| 258 |
2/2✓ Branch 1 taken 85 times.
✓ Branch 2 taken 106 times.
|
191 | if (!av_strncasecmp(param, "size=", 5)) { |
| 259 |
2/2✓ Branch 0 taken 32 times.
✓ Branch 1 taken 53 times.
|
85 | param += 5 + (param[5] == '"'); |
| 260 |
1/2✓ Branch 0 taken 85 times.
✗ Branch 1 not taken.
|
85 | if (sscanf(param, "%u", &new_tag->size) == 1) |
| 261 | 85 | av_bprintf(dst, "{\\fs%u}", new_tag->size); | |
| 262 |
2/2✓ Branch 1 taken 97 times.
✓ Branch 2 taken 9 times.
|
106 | } else if (!av_strncasecmp(param, "color=", 6)) { |
| 263 | int color; | ||
| 264 |
2/2✓ Branch 0 taken 66 times.
✓ Branch 1 taken 31 times.
|
97 | param += 6 + (param[6] == '"'); |
| 265 | 97 | color = html_color_parse(log_ctx, param); | |
| 266 |
1/2✓ Branch 0 taken 97 times.
✗ Branch 1 not taken.
|
97 | if (color >= 0) { |
| 267 | 97 | new_tag->color = 0xff000000 | color; | |
| 268 | 97 | av_bprintf(dst, "{\\c&H%"PRIX32"&}", new_tag->color & 0xffffff); | |
| 269 | } | ||
| 270 |
2/2✓ Branch 1 taken 8 times.
✓ Branch 2 taken 1 times.
|
9 | } else if (!av_strncasecmp(param, "face=", 5)) { |
| 271 |
1/2✓ Branch 0 taken 8 times.
✗ Branch 1 not taken.
|
8 | param += 5 + (param[5] == '"'); |
| 272 | 8 | len = strcspn(param, | |
| 273 |
1/2✓ Branch 0 taken 8 times.
✗ Branch 1 not taken.
|
8 | param[-1] == '"' ? "\"" :" "); |
| 274 | 8 | av_strlcpy(new_tag->face, param, | |
| 275 |
1/2✓ Branch 0 taken 8 times.
✗ Branch 1 not taken.
|
8 | FFMIN(sizeof(new_tag->face), len+1)); |
| 276 | 8 | param += len; | |
| 277 | 8 | av_bprintf(dst, "{\\fn%s}", new_tag->face); | |
| 278 | } | ||
| 279 |
2/2✓ Branch 0 taken 36 times.
✓ Branch 1 taken 155 times.
|
191 | if ((param = strchr(param, ' '))) |
| 280 | 36 | param++; | |
| 281 | } | ||
| 282 | } | ||
| 283 | 310 | in += skip; | |
| 284 |
6/6✓ Branch 0 taken 427 times.
✓ Branch 1 taken 1 times.
✓ Branch 2 taken 258 times.
✓ Branch 3 taken 169 times.
✓ Branch 4 taken 256 times.
✓ Branch 5 taken 2 times.
|
428 | } else if (tagname[0] && !tagname[1] && strchr("bisu", av_tolower(tagname[0]))) { |
| 285 | 256 | av_bprintf(dst, "{\\%c%d}", (char)av_tolower(tagname[0]), !tag_close); | |
| 286 | 256 | in += skip; | |
| 287 |
2/2✓ Branch 1 taken 6 times.
✓ Branch 2 taken 166 times.
|
172 | } else if (!av_strncasecmp(tagname, "br", 2) && |
| 288 |
5/6✓ Branch 0 taken 3 times.
✓ Branch 1 taken 3 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
✓ Branch 4 taken 1 times.
✗ Branch 5 not taken.
|
6 | (!tagname[2] || (tagname[2] == '/' && !tagname[3]))) { |
| 289 | 4 | av_bprintf(dst, "\\N"); | |
| 290 | 4 | in += skip; | |
| 291 |
2/2✓ Branch 0 taken 151 times.
✓ Branch 1 taken 17 times.
|
168 | } else if (likely_a_tag) { |
| 292 |
2/2✓ Branch 0 taken 81 times.
✓ Branch 1 taken 70 times.
|
151 | if (!tag_close) // warn only once |
| 293 | 81 | av_log(log_ctx, AV_LOG_WARNING, "Unrecognized tag %s\n", tagname); | |
| 294 | 151 | in += skip; | |
| 295 | } else { | ||
| 296 | 17 | av_bprint_chars(dst, '<', 1); | |
| 297 | } | ||
| 298 | } else { | ||
| 299 | 24 | av_bprint_chars(dst, *in, 1); | |
| 300 | } | ||
| 301 | 762 | break; | |
| 302 | 27401 | default: | |
| 303 | 27401 | av_bprint_chars(dst, *in, 1); | |
| 304 | 27401 | break; | |
| 305 | } | ||
| 306 |
5/6✓ Branch 0 taken 28814 times.
✓ Branch 1 taken 4783 times.
✓ Branch 2 taken 28814 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 28291 times.
✓ Branch 5 taken 523 times.
|
33597 | if (*in != ' ' && *in != '\r' && *in != '\n') |
| 307 | 28291 | line_start = 0; | |
| 308 | } | ||
| 309 | |||
| 310 |
1/2✗ Branch 1 not taken.
✓ Branch 2 taken 411 times.
|
411 | if (!av_bprint_is_complete(dst)) |
| 311 | ✗ | return AVERROR(ENOMEM); | |
| 312 | |||
| 313 |
3/4✓ Branch 0 taken 410 times.
✓ Branch 1 taken 1 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 410 times.
|
411 | while (dst->len >= 2 && !strncmp(&dst->str[dst->len - 2], "\\N", 2)) |
| 314 | ✗ | dst->len -= 2; | |
| 315 | 411 | dst->str[dst->len] = 0; | |
| 316 | 411 | rstrip_spaces_buf(dst); | |
| 317 | |||
| 318 | 411 | return 0; | |
| 319 | } | ||
| 320 |