/*
* Copyright (C) 2016 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package androidx.media3.extractor.text.webvtt;
import static java.lang.Math.min;
import static java.lang.annotation.RetentionPolicy.SOURCE;
import android.graphics.Color;
import android.graphics.Typeface;
import android.text.Layout;
import android.text.SpannableStringBuilder;
import android.text.Spanned;
import android.text.SpannedString;
import android.text.TextUtils;
import android.text.style.AbsoluteSizeSpan;
import android.text.style.BackgroundColorSpan;
import android.text.style.ForegroundColorSpan;
import android.text.style.RelativeSizeSpan;
import android.text.style.StrikethroughSpan;
import android.text.style.StyleSpan;
import android.text.style.TypefaceSpan;
import android.text.style.UnderlineSpan;
import androidx.annotation.IntDef;
import androidx.annotation.Nullable;
import androidx.media3.common.text.Cue;
import androidx.media3.common.text.HorizontalTextInVerticalContextSpan;
import androidx.media3.common.text.RubySpan;
import androidx.media3.common.text.SpanUtil;
import androidx.media3.common.text.TextAnnotation;
import androidx.media3.common.util.Assertions;
import androidx.media3.common.util.Log;
import androidx.media3.common.util.ParsableByteArray;
import androidx.media3.common.util.UnstableApi;
import androidx.media3.common.util.Util;
import java.lang.annotation.Documented;
import java.lang.annotation.Retention;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.checkerframework.checker.nullness.qual.MonotonicNonNull;
/** Parser for WebVTT cues. (https://w3c.github.io/webvtt/#cues) */
@UnstableApi
public final class WebvttCueParser {
/**
* Valid values for {@link WebvttCueInfoBuilder#textAlignment}.
*
* <p>We use a custom list (and not {@link Layout.Alignment} directly) in order to include both
* {@code START}/{@code LEFT} and {@code END}/{@code RIGHT}. The distinction is important for
* {@link WebvttCueInfoBuilder#derivePosition(int)}.
*
* <p>These correspond to the valid values for the 'align' cue setting in the WebVTT spec.
*/
@Documented
@Retention(SOURCE)
@IntDef({
TEXT_ALIGNMENT_START,
TEXT_ALIGNMENT_CENTER,
TEXT_ALIGNMENT_END,
TEXT_ALIGNMENT_LEFT,
TEXT_ALIGNMENT_RIGHT
})
private @interface TextAlignment {}
/**
* Alignment produced by the {@code align:start} cue setting. See WebVTT's align:start.
*/
private static final int TEXT_ALIGNMENT_START = 1;
/**
* Alignment produced by the {@code align:center} (or {@code align:middle}) cue setting, and the
* fallback used when the alignment value is not recognized. See WebVTT's align:center.
*/
private static final int TEXT_ALIGNMENT_CENTER = 2;
/**
* Alignment produced by the {@code align:end} cue setting. See WebVTT's align:end.
*/
private static final int TEXT_ALIGNMENT_END = 3;
/**
* Alignment produced by the {@code align:left} cue setting. See WebVTT's align:left.
*/
private static final int TEXT_ALIGNMENT_LEFT = 4;
/**
* Alignment produced by the {@code align:right} cue setting. See WebVTT's align:right.
*/
private static final int TEXT_ALIGNMENT_RIGHT = 5;
/**
* Matches a cue header line: two whitespace-free tokens separated by {@code -->} (with whitespace
* on both sides), optionally followed by further text. Within this class, groups 1 and 2 are
* parsed as the cue start and end timestamps and group 3 as the cue settings list.
*/
public static final Pattern CUE_HEADER_PATTERN =
Pattern.compile("^(\\S+)\\s+-->\\s+(\\S+)(.*)?$");
/** Matches one {@code name:value} cue setting; group 1 is the name and group 2 is the value. */
private static final Pattern CUE_SETTING_PATTERN = Pattern.compile("(\\S+?):(\\S+)");
// Characters that receive special handling when scanning cue text markup.
private static final char CHAR_LESS_THAN = '<';
private static final char CHAR_GREATER_THAN = '>';
private static final char CHAR_SLASH = '/';
private static final char CHAR_AMPERSAND = '&';
private static final char CHAR_SEMI_COLON = ';';
private static final char CHAR_SPACE = ' ';
// Names of the character entities (the text between '&' and its terminator) that are replaced
// with a literal character; any other entity is logged and ignored.
private static final String ENTITY_LESS_THAN = "lt";
private static final String ENTITY_GREATER_THAN = "gt";
private static final String ENTITY_AMPERSAND = "amp";
private static final String ENTITY_NON_BREAK_SPACE = "nbsp";
// Names of the markup tags that are treated as supported; tags with other names are skipped.
private static final String TAG_BOLD = "b";
private static final String TAG_CLASS = "c";
private static final String TAG_ITALIC = "i";
private static final String TAG_LANG = "lang";
private static final String TAG_RUBY = "ruby";
private static final String TAG_RUBY_TEXT = "rt";
private static final String TAG_UNDERLINE = "u";
private static final String TAG_VOICE = "v";
// Typeface styles passed to StyleSpan for the bold and italic tags.
private static final int STYLE_BOLD = Typeface.BOLD;
private static final int STYLE_ITALIC = Typeface.ITALIC;
// NOTE(review): presumably the fallback cue position expressed as a fraction; it is not used in
// the visible part of this file, so confirm against its callers.
/* package */ static final float DEFAULT_POSITION = 0.5f;
// Tag passed to Log calls made by this class.
private static final String TAG = "WebvttCueParser";
/**
 * Unmodifiable map from color name to the corresponding text color, expressed as the color int
 * returned by {@link Color#rgb(int, int, int)}. See WebVTT's <a
 * href="https://www.w3.org/TR/webvtt1/#default-text-color">default text colors</a>.
 */
private static final Map<String, Integer> DEFAULT_TEXT_COLORS;

static {
  // Populate a mutable map first, then publish it through an unmodifiable view.
  Map<String, Integer> defaultColors = new HashMap<>();
  defaultColors.put("white", Color.rgb(255, 255, 255));
  defaultColors.put("lime", Color.rgb(0, 255, 0));
  defaultColors.put("cyan", Color.rgb(0, 255, 255));
  defaultColors.put("red", Color.rgb(255, 0, 0));
  defaultColors.put("yellow", Color.rgb(255, 255, 0));
  defaultColors.put("magenta", Color.rgb(255, 0, 255));
  defaultColors.put("blue", Color.rgb(0, 0, 255));
  defaultColors.put("black", Color.rgb(0, 0, 0));
  DEFAULT_TEXT_COLORS = Collections.unmodifiableMap(defaultColors);
}
/**
 * Unmodifiable map from background class name (a color name prefixed with {@code bg_}) to the
 * corresponding background color, expressed as the color int returned by {@link
 * Color#rgb(int, int, int)}. See WebVTT's <a
 * href="https://www.w3.org/TR/webvtt1/#default-text-background">default text background
 * colors</a>.
 */
private static final Map<String, Integer> DEFAULT_BACKGROUND_COLORS;

static {
  // Populate a mutable map first, then publish it through an unmodifiable view.
  Map<String, Integer> defaultBackgroundColors = new HashMap<>();
  defaultBackgroundColors.put("bg_white", Color.rgb(255, 255, 255));
  defaultBackgroundColors.put("bg_lime", Color.rgb(0, 255, 0));
  defaultBackgroundColors.put("bg_cyan", Color.rgb(0, 255, 255));
  defaultBackgroundColors.put("bg_red", Color.rgb(255, 0, 0));
  defaultBackgroundColors.put("bg_yellow", Color.rgb(255, 255, 0));
  defaultBackgroundColors.put("bg_magenta", Color.rgb(255, 0, 255));
  defaultBackgroundColors.put("bg_blue", Color.rgb(0, 0, 255));
  defaultBackgroundColors.put("bg_black", Color.rgb(0, 0, 0));
  DEFAULT_BACKGROUND_COLORS = Collections.unmodifiableMap(defaultBackgroundColors);
}
/**
 * Reads the next cue from a parsable array and parses its timestamps, settings and text.
 *
 * <p>The cue header (the line containing {@code -->}) is looked for on the first line read and,
 * failing that, on the second line; in the latter case the trimmed first line is used as the cue
 * id. At most two lines are consumed while searching for the header.
 *
 * @param webvttData Parsable WebVTT file data.
 * @param styles List of styles defined by the CSS style blocks preceding the cues.
 * @return The parsed cue info, or null if no valid cue was found.
 */
@Nullable
public static WebvttCueInfo parseCue(ParsableByteArray webvttData, List styles) {
  @Nullable String idOrHeaderLine = webvttData.readLine();
  if (idOrHeaderLine == null) {
    return null;
  }
  Matcher headerMatcher = CUE_HEADER_PATTERN.matcher(idOrHeaderLine);
  if (headerMatcher.matches()) {
    // The timestamps are on the very first line, so this cue has no id.
    return parseCue(/* id= */ null, headerMatcher, webvttData, styles);
  }
  // The first line may be a cue id, in which case the timestamps follow on the next line.
  @Nullable String headerLine = webvttData.readLine();
  if (headerLine == null) {
    return null;
  }
  headerMatcher = CUE_HEADER_PATTERN.matcher(headerLine);
  return headerMatcher.matches()
      ? parseCue(idOrHeaderLine.trim(), headerMatcher, webvttData, styles)
      : null;
}
/**
 * Parses a cue settings list into a fresh {@link Cue.Builder}.
 *
 * @param cueSettingsList String containing the settings for a given cue.
 * @return The cue settings parsed into a {@link Cue.Builder}.
 */
/* package */ static Cue.Builder parseCueSettingsList(String cueSettingsList) {
  WebvttCueInfoBuilder settingsHolder = new WebvttCueInfoBuilder();
  parseCueSettingsList(cueSettingsList, settingsHolder);
  Cue.Builder cueBuilder = settingsHolder.toCueBuilder();
  return cueBuilder;
}
/**
 * Creates a new {@link Cue} that contains {@code text}, leaving every other property of the
 * underlying {@code WebvttCueInfoBuilder} untouched so that WebVTT default values apply.
 */
/* package */ static Cue newCueForText(CharSequence text) {
  WebvttCueInfoBuilder defaults = new WebvttCueInfoBuilder();
  defaults.text = text;
  Cue.Builder cueBuilder = defaults.toCueBuilder();
  return cueBuilder.build();
}
/**
 * Parses the text payload of a WebVTT Cue and returns it as a styled {@link SpannedString}.
 *
 * <p>The markup is scanned character by character:
 *
 * <ul>
 *   <li>{@code <} starts a tag. Supported start tags are pushed onto a stack; a closing tag pops
 *       (and applies spans for) start tags until one with the same name has been popped or the
 *       stack is empty. Void tags ({@code <.../>}), empty tags and unsupported tags are skipped.
 *       Malformed tags whose name would start after it ends (for example a trailing {@code </}
 *       or the tag {@code </>}) are skipped as well instead of throwing.
 *   <li>{@code &} starts an entity that ends at the first following {@code ;} or space. If it is
 *       terminated by a space, the space is kept in the output. If no terminator exists, the
 *       ampersand is appended literally.
 *   <li>Every other character is appended as is.
 * </ul>
 *
 * <p>Tags still open at the end of the markup are applied as if they were closed there, and a
 * final "whole cue" virtual tag is applied last.
 *
 * @param id ID of the cue, {@code null} if it is not present.
 * @param markup The markup text to be parsed.
 * @param styles List of styles defined by the CSS style blocks preceding the cues.
 * @return The styled cue text.
 */
/* package */ static SpannedString parseCueText(
    @Nullable String id, String markup, List styles) {
  SpannableStringBuilder spannedText = new SpannableStringBuilder();
  ArrayDeque<StartTag> startTagStack = new ArrayDeque<>();
  int pos = 0;
  List<Element> nestedElements = new ArrayList<>();
  while (pos < markup.length()) {
    char curr = markup.charAt(pos);
    switch (curr) {
      case CHAR_LESS_THAN:
        if (pos + 1 >= markup.length()) {
          // A '<' that is the last character cannot start a tag; skip it without reading past
          // the end of the markup.
          pos++;
          break;
        }
        int ltPos = pos;
        boolean isClosingTag = markup.charAt(ltPos + 1) == CHAR_SLASH;
        pos = findEndOfTag(markup, ltPos + 1);
        boolean isVoidTag = markup.charAt(pos - 2) == CHAR_SLASH;
        int tagExpressionStart = ltPos + (isClosingTag ? 2 : 1);
        int tagExpressionEnd = isVoidTag ? pos - 2 : pos - 1;
        if (tagExpressionEnd <= tagExpressionStart) {
          // Nothing between the delimiters. The end can even precede the start, e.g. for a
          // trailing "</" or for "</>" where the same slash is counted as both the closing and
          // the void marker; substring() would throw for those, so skip the tag.
          continue;
        }
        String fullTagExpression = markup.substring(tagExpressionStart, tagExpressionEnd);
        if (fullTagExpression.trim().isEmpty()) {
          continue;
        }
        String tagName = getTagName(fullTagExpression);
        if (!isSupportedTag(tagName)) {
          continue;
        }
        if (isClosingTag) {
          StartTag startTag;
          do {
            if (startTagStack.isEmpty()) {
              break;
            }
            startTag = startTagStack.pop();
            applySpansForTag(id, startTag, nestedElements, spannedText, styles);
            if (!startTagStack.isEmpty()) {
              // Still inside an enclosing tag: remember this element for when that tag closes.
              nestedElements.add(new Element(startTag, spannedText.length()));
            } else {
              nestedElements.clear();
            }
          } while (!startTag.name.equals(tagName));
        } else if (!isVoidTag) {
          startTagStack.push(StartTag.buildStartTag(fullTagExpression, spannedText.length()));
        }
        break;
      case CHAR_AMPERSAND:
        int semiColonEndIndex = markup.indexOf(CHAR_SEMI_COLON, pos + 1);
        int spaceEndIndex = markup.indexOf(CHAR_SPACE, pos + 1);
        // The entity ends at whichever terminator comes first; -1 means neither was found.
        int entityEndIndex =
            semiColonEndIndex == -1
                ? spaceEndIndex
                : (spaceEndIndex == -1
                    ? semiColonEndIndex
                    : min(semiColonEndIndex, spaceEndIndex));
        if (entityEndIndex != -1) {
          applyEntity(markup.substring(pos + 1, entityEndIndex), spannedText);
          if (entityEndIndex == spaceEndIndex) {
            // The terminating space is part of the text, so keep it.
            spannedText.append(" ");
          }
          pos = entityEndIndex + 1;
        } else {
          spannedText.append(curr);
          pos++;
        }
        break;
      default:
        spannedText.append(curr);
        pos++;
        break;
    }
  }
  // Apply tags that were never closed as if they ended at the end of the text.
  while (!startTagStack.isEmpty()) {
    applySpansForTag(id, startTagStack.pop(), nestedElements, spannedText, styles);
  }
  applySpansForTag(
      id,
      StartTag.buildWholeCueVirtualTag(),
      /* nestedElements= */ Collections.emptyList(),
      spannedText,
      styles);
  return SpannedString.valueOf(spannedText);
}
// Internal methods
/**
 * Builds a cue from an already matched cue header and the text lines that follow it.
 *
 * <p>Text lines are read until an empty line or the end of the data is reached; each line is
 * trimmed and the lines are joined with {@code \n}.
 *
 * @param id ID of the cue, {@code null} if it is not present.
 * @param cueHeaderMatcher A matcher that has matched {@link #CUE_HEADER_PATTERN}.
 * @param webvttData Parsable WebVTT file data, positioned after the cue header line.
 * @param styles List of styles defined by the CSS style blocks preceding the cues.
 * @return The parsed cue info, or null if a timestamp in the header could not be parsed.
 */
@Nullable
private static WebvttCueInfo parseCue(
    @Nullable String id,
    Matcher cueHeaderMatcher,
    ParsableByteArray webvttData,
    List styles) {
  WebvttCueInfoBuilder cueInfo = new WebvttCueInfoBuilder();
  try {
    // Groups 1 and 2 of the header hold the start and end timestamps respectively.
    String startTimestamp = Assertions.checkNotNull(cueHeaderMatcher.group(1));
    cueInfo.startTimeUs = WebvttParserUtil.parseTimestampUs(startTimestamp);
    String endTimestamp = Assertions.checkNotNull(cueHeaderMatcher.group(2));
    cueInfo.endTimeUs = WebvttParserUtil.parseTimestampUs(endTimestamp);
  } catch (NumberFormatException e) {
    Log.w(TAG, "Skipping cue with bad header: " + cueHeaderMatcher.group());
    return null;
  }
  String settingsList = Assertions.checkNotNull(cueHeaderMatcher.group(3));
  parseCueSettingsList(settingsList, cueInfo);

  // Collect the cue payload, one trimmed line at a time.
  StringBuilder payload = new StringBuilder();
  String line = webvttData.readLine();
  while (!TextUtils.isEmpty(line)) {
    if (payload.length() > 0) {
      payload.append("\n");
    }
    payload.append(line.trim());
    line = webvttData.readLine();
  }
  cueInfo.text = parseCueText(id, payload.toString(), styles);
  return cueInfo.build();
}
/**
 * Applies every {@code name:value} setting found in {@code cueSettingsList} to {@code builder}.
 *
 * <p>Unknown setting names are logged and ignored. A setting whose handling throws {@link
 * NumberFormatException} is logged and skipped; the remaining settings are still processed.
 *
 * @param cueSettingsList String containing the settings for a given cue.
 * @param builder The builder that receives the parsed settings.
 */
private static void parseCueSettingsList(String cueSettingsList, WebvttCueInfoBuilder builder) {
  Matcher settingMatcher = CUE_SETTING_PATTERN.matcher(cueSettingsList);
  while (settingMatcher.find()) {
    String name = Assertions.checkNotNull(settingMatcher.group(1));
    String value = Assertions.checkNotNull(settingMatcher.group(2));
    try {
      switch (name) {
        case "line":
          parseLineAttribute(value, builder);
          break;
        case "align":
          builder.textAlignment = parseTextAlignment(value);
          break;
        case "position":
          parsePositionAttribute(value, builder);
          break;
        case "size":
          builder.size = WebvttParserUtil.parsePercentage(value);
          break;
        case "vertical":
          builder.verticalType = parseVerticalAttribute(value);
          break;
        default:
          Log.w(TAG, "Unknown cue setting " + name + ":" + value);
          break;
      }
    } catch (NumberFormatException e) {
      Log.w(TAG, "Skipping bad cue setting: " + settingMatcher.group());
    }
  }
}
/**
 * Parses the value of the 'line' cue setting into {@code builder}.
 *
 * <p>The value may carry an anchor after a comma; the anchor is stored before the line value is
 * parsed. A line value ending in {@code %} is stored as a fraction, anything else is parsed as
 * an integer line number.
 *
 * @param s The setting value, e.g. {@code "50%,start"} or {@code "-1"}.
 * @param builder The builder that receives the line, line type and (if present) line anchor.
 * @throws NumberFormatException If the line value is neither a valid percentage nor an integer.
 */
private static void parseLineAttribute(String s, WebvttCueInfoBuilder builder) {
  int separatorIndex = s.indexOf(',');
  String lineValue = s;
  if (separatorIndex != -1) {
    builder.lineAnchor = parseLineAnchor(s.substring(separatorIndex + 1));
    lineValue = s.substring(0, separatorIndex);
  }
  if (!lineValue.endsWith("%")) {
    builder.line = Integer.parseInt(lineValue);
    builder.lineType = Cue.LINE_TYPE_NUMBER;
    return;
  }
  builder.line = WebvttParserUtil.parsePercentage(lineValue);
  builder.lineType = Cue.LINE_TYPE_FRACTION;
}
/**
 * Maps the anchor part of a 'line' cue setting to a {@link Cue.AnchorType}.
 *
 * @param s The anchor value.
 * @return The matching anchor type, or {@link Cue#TYPE_UNSET} (after logging a warning) if the
 *     value is not recognized.
 */
@Cue.AnchorType
private static int parseLineAnchor(String s) {
  int anchor;
  switch (s) {
    case "end":
      anchor = Cue.ANCHOR_TYPE_END;
      break;
    case "middle":
    case "center":
      anchor = Cue.ANCHOR_TYPE_MIDDLE;
      break;
    case "start":
      anchor = Cue.ANCHOR_TYPE_START;
      break;
    default:
      Log.w(TAG, "Invalid anchor value: " + s);
      anchor = Cue.TYPE_UNSET;
      break;
  }
  return anchor;
}
/**
 * Parses the value of the 'position' cue setting into {@code builder}.
 *
 * <p>The value may carry an anchor after a comma; the anchor is stored before the percentage is
 * parsed.
 *
 * @param s The setting value, e.g. {@code "50%,line-left"}.
 * @param builder The builder that receives the position and (if present) position anchor.
 */
private static void parsePositionAttribute(String s, WebvttCueInfoBuilder builder) {
  int separatorIndex = s.indexOf(',');
  String percentage = s;
  if (separatorIndex != -1) {
    builder.positionAnchor = parsePositionAnchor(s.substring(separatorIndex + 1));
    percentage = s.substring(0, separatorIndex);
  }
  builder.position = WebvttParserUtil.parsePercentage(percentage);
}
/**
 * Maps the anchor part of a 'position' cue setting to a {@link Cue.AnchorType}.
 *
 * @param s The anchor value.
 * @return The matching anchor type, or {@link Cue#TYPE_UNSET} (after logging a warning) if the
 *     value is not recognized.
 */
@Cue.AnchorType
private static int parsePositionAnchor(String s) {
  if (s.equals("line-left") || s.equals("start")) {
    return Cue.ANCHOR_TYPE_START;
  }
  if (s.equals("center") || s.equals("middle")) {
    return Cue.ANCHOR_TYPE_MIDDLE;
  }
  if (s.equals("line-right") || s.equals("end")) {
    return Cue.ANCHOR_TYPE_END;
  }
  Log.w(TAG, "Invalid anchor value: " + s);
  return Cue.TYPE_UNSET;
}
/**
 * Maps the value of the 'vertical' cue setting to a {@link Cue.VerticalType}.
 *
 * @param s The setting value, expected to be {@code "rl"} or {@code "lr"}.
 * @return The matching vertical type, or {@link Cue#TYPE_UNSET} (after logging a warning) if the
 *     value is not recognized.
 */
@Cue.VerticalType
private static int parseVerticalAttribute(String s) {
  int verticalType;
  switch (s) {
    case "lr":
      verticalType = Cue.VERTICAL_TYPE_LR;
      break;
    case "rl":
      verticalType = Cue.VERTICAL_TYPE_RL;
      break;
    default:
      Log.w(TAG, "Invalid 'vertical' value: " + s);
      verticalType = Cue.TYPE_UNSET;
      break;
  }
  return verticalType;
}
/**
 * Maps the value of the 'align' cue setting to a {@link TextAlignment}.
 *
 * @param s The setting value.
 * @return The matching alignment, or {@link #TEXT_ALIGNMENT_CENTER} (after logging a warning) if
 *     the value is not recognized.
 */
@TextAlignment
private static int parseTextAlignment(String s) {
  if (s.equals("start")) {
    return TEXT_ALIGNMENT_START;
  }
  if (s.equals("left")) {
    return TEXT_ALIGNMENT_LEFT;
  }
  if (s.equals("center") || s.equals("middle")) {
    return TEXT_ALIGNMENT_CENTER;
  }
  if (s.equals("end")) {
    return TEXT_ALIGNMENT_END;
  }
  if (s.equals("right")) {
    return TEXT_ALIGNMENT_RIGHT;
  }
  Log.w(TAG, "Invalid alignment value: " + s);
  // Default value: https://www.w3.org/TR/webvtt1/#webvtt-cue-text-alignment
  return TEXT_ALIGNMENT_CENTER;
}
/**
 * Locates the end of a tag, i.e. the first {@code >} at or after {@code startPos}.
 *
 * @param markup The WebVTT cue markup to be parsed.
 * @param startPos The position from where to start searching for the end of tag.
 * @return The position just after the {@code >} (exclusive end of the tag), or the length of
 *     {@code markup} if there is no {@code >}.
 */
private static int findEndOfTag(String markup, int startPos) {
  int closeIndex = markup.indexOf(CHAR_GREATER_THAN, startPos);
  if (closeIndex == -1) {
    return markup.length();
  }
  return closeIndex + 1;
}
/**
 * Appends the character represented by a named entity to {@code spannedText}.
 *
 * <p>Entities other than {@code lt}, {@code gt}, {@code nbsp} and {@code amp} are logged and
 * nothing is appended for them.
 *
 * @param entity The entity name, without the leading {@code &} and the terminator.
 * @param spannedText The text being built.
 */
private static void applyEntity(String entity, SpannableStringBuilder spannedText) {
  char replacement;
  switch (entity) {
    case ENTITY_AMPERSAND:
      replacement = '&';
      break;
    case ENTITY_NON_BREAK_SPACE:
      replacement = ' ';
      break;
    case ENTITY_GREATER_THAN:
      replacement = '>';
      break;
    case ENTITY_LESS_THAN:
      replacement = '<';
      break;
    default:
      Log.w(TAG, "ignoring unsupported entity: '&" + entity + ";'");
      return;
  }
  spannedText.append(replacement);
}
/**
 * Returns whether {@code tagName} is one of the tag names handled by this parser: bold, class,
 * italic, lang, ruby, ruby text, underline or voice.
 *
 * @param tagName The tag name to check.
 * @return {@code true} if the tag is supported, {@code false} otherwise.
 */
private static boolean isSupportedTag(String tagName) {
  return tagName.equals(TAG_BOLD)
      || tagName.equals(TAG_CLASS)
      || tagName.equals(TAG_ITALIC)
      || tagName.equals(TAG_LANG)
      || tagName.equals(TAG_RUBY)
      || tagName.equals(TAG_RUBY_TEXT)
      || tagName.equals(TAG_UNDERLINE)
      || tagName.equals(TAG_VOICE);
}
/**
 * Applies the spans for one tag to the range of {@code text} that starts at the tag's position
 * and ends at the current end of {@code text}.
 *
 * <p>First the span implied by the tag name is applied (bold, italic, underline, ruby or default
 * class colors; lang, voice and the "whole cue" virtual tag have none). Then each style returned
 * by {@code getApplicableStyles} for this tag is applied to the same range. Tags with any other
 * name are ignored entirely.
 *
 * @param cueId ID of the cue, {@code null} if it is not present.
 * @param startTag The start tag whose spans should be applied.
 * @param nestedElements Elements nested inside the tag; only passed on for ruby tags.
 * @param text The text being built.
 * @param styles List of styles defined by the CSS style blocks preceding the cues.
 */
private static void applySpansForTag(
    @Nullable String cueId,
    StartTag startTag,
    List nestedElements,
    SpannableStringBuilder text,
    List styles) {
  // Both bounds are captured before any span handling runs.
  int spanStart = startTag.position;
  int spanEnd = text.length();
  switch (startTag.name) {
    case TAG_LANG:
    case TAG_VOICE:
    case "": // Case of the "whole cue" virtual tag.
      // No span of their own; only the applicable styles below are used.
      break;
    case TAG_CLASS:
      applyDefaultColors(text, startTag.classes, spanStart, spanEnd);
      break;
    case TAG_UNDERLINE:
      text.setSpan(new UnderlineSpan(), spanStart, spanEnd, Spanned.SPAN_EXCLUSIVE_EXCLUSIVE);
      break;
    case TAG_RUBY:
      applyRubySpans(text, cueId, startTag, nestedElements, styles);
      break;
    case TAG_ITALIC:
      text.setSpan(
          new StyleSpan(STYLE_ITALIC), spanStart, spanEnd, Spanned.SPAN_EXCLUSIVE_EXCLUSIVE);
      break;
    case TAG_BOLD:
      text.setSpan(
          new StyleSpan(STYLE_BOLD), spanStart, spanEnd, Spanned.SPAN_EXCLUSIVE_EXCLUSIVE);
      break;
    default:
      return;
  }
  List matchingStyles = getApplicableStyles(styles, cueId, startTag);
  for (int index = 0; index < matchingStyles.size(); index++) {
    applyStyleToText(text, matchingStyles.get(index).style, spanStart, spanEnd);
  }
}
private static void applyRubySpans(
SpannableStringBuilder text,
@Nullable String cueId,
StartTag startTag,
List nestedElements,
List styles) {
@TextAnnotation.Position int rubyTagPosition = getRubyPosition(styles, cueId, startTag);
List sortedNestedElements = new ArrayList<>(nestedElements.size());
sortedNestedElements.addAll(nestedElements);
Collections.sort(sortedNestedElements, Element.BY_START_POSITION_ASC);
int deletedCharCount = 0;
int lastRubyTextEnd = startTag.position;
for (int i = 0; i < sortedNestedElements.size(); i++) {
if (!TAG_RUBY_TEXT.equals(sortedNestedElements.get(i).startTag.name)) {
continue;
}
Element rubyTextElement = sortedNestedElements.get(i);
// Use the