Commit eb51ad57 by Ian Baker Committed by christosts

Merge pull request #10750 from Stronger197:subrip_utf_16

PiperOrigin-RevId: 492164739
(cherry picked from commit 496cfa42)
parent 2bfced9b
...@@ -17,6 +17,9 @@ package com.google.android.exoplayer2.util; ...@@ -17,6 +17,9 @@ package com.google.android.exoplayer2.util;
import androidx.annotation.Nullable; import androidx.annotation.Nullable;
import com.google.common.base.Charsets; import com.google.common.base.Charsets;
import com.google.common.collect.ImmutableSet;
import com.google.common.primitives.Chars;
import com.google.common.primitives.UnsignedBytes;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.util.Arrays; import java.util.Arrays;
...@@ -27,6 +30,12 @@ import java.util.Arrays; ...@@ -27,6 +30,12 @@ import java.util.Arrays;
*/ */
public final class ParsableByteArray { public final class ParsableByteArray {
// Candidate characters handed to readCharacterIfInList when consuming the first character of a
// line terminator: either a carriage return or a line feed is accepted.
private static final char[] CR_AND_LF = {'\r', '\n'};
// Candidate characters handed to readCharacterIfInList after a carriage return has been consumed,
// so that a "\r\n" pair is skipped as a single terminator.
private static final char[] LF = {'\n'};
// Charsets accepted by readLine(Charset); any other charset is rejected via
// Assertions.checkArgument with an "Unsupported charset" message.
private static final ImmutableSet<Charset> SUPPORTED_CHARSETS_FOR_READLINE =
ImmutableSet.of(
Charsets.US_ASCII, Charsets.UTF_8, Charsets.UTF_16, Charsets.UTF_16BE, Charsets.UTF_16LE);
private byte[] data; private byte[] data;
private int position; private int position;
// TODO(internal b/147657250): Enforce this limit on all read methods. // TODO(internal b/147657250): Enforce this limit on all read methods.
...@@ -489,45 +498,47 @@ public final class ParsableByteArray { ...@@ -489,45 +498,47 @@ public final class ParsableByteArray {
} }
/** /**
* Reads a line of text. * Reads a line of text in UTF-8.
*
* <p>Equivalent to passing {@link Charsets#UTF_8} to {@link #readLine(Charset)}.
*/
@Nullable
public String readLine() {
  // Delegate to the charset-aware overload with the charset fixed to UTF-8.
  Charset utf8 = Charsets.UTF_8;
  return readLine(utf8);
}
/**
* Reads a line of text in {@code charset}.
* *
* <p>A line is considered to be terminated by any one of a carriage return ('\r'), a line feed * <p>A line is considered to be terminated by any one of a carriage return ('\r'), a line feed
* ('\n'), or a carriage return followed immediately by a line feed ('\r\n'). The UTF-8 charset is * ('\n'), or a carriage return followed immediately by a line feed ('\r\n'). This method discards
* used. This method discards leading UTF-8 byte order marks, if present. * leading UTF byte order marks (BOM), if present.
*
* <p>The {@linkplain #getPosition() position} is advanced to start of the next line (i.e. any
* line terminators are skipped).
* *
* @param charset The charset used to interpret the bytes as a {@link String}.
* @return The line not including any line-termination characters, or null if the end of the data * @return The line not including any line-termination characters, or null if the end of the data
* has already been reached. * has already been reached.
* @throws IllegalArgumentException if charset is not supported. Only US_ASCII, UTF-8, UTF-16,
* UTF-16BE, and UTF-16LE are supported.
*/ */
@Nullable @Nullable
public String readLine() { public String readLine(Charset charset) {
Assertions.checkArgument(
SUPPORTED_CHARSETS_FOR_READLINE.contains(charset), "Unsupported charset: " + charset);
if (bytesLeft() == 0) { if (bytesLeft() == 0) {
return null; return null;
} }
int lineLimit = position; if (!charset.equals(Charsets.US_ASCII)) {
while (lineLimit < limit && !Util.isLinebreak(data[lineLimit])) { readUtfCharsetFromBom(); // Skip BOM if present
lineLimit++;
} }
if (lineLimit - position >= 3 int lineLimit = findNextLineTerminator(charset);
&& data[position] == (byte) 0xEF String line = readString(lineLimit - position, charset);
&& data[position + 1] == (byte) 0xBB
&& data[position + 2] == (byte) 0xBF) {
// There's a UTF-8 byte order mark at the start of the line. Discard it.
position += 3;
}
String line = Util.fromUtf8Bytes(data, position, lineLimit - position);
position = lineLimit;
if (position == limit) { if (position == limit) {
return line; return line;
} }
if (data[position] == '\r') { skipLineTerminator(charset);
position++;
if (position == limit) {
return line;
}
}
if (data[position] == '\n') {
position++;
}
return line; return line;
} }
...@@ -565,4 +576,99 @@ public final class ParsableByteArray { ...@@ -565,4 +576,99 @@ public final class ParsableByteArray {
position += length; position += length;
return value; return value;
} }
/**
 * Consumes a UTF byte order mark (BOM) located at the current {@link #getPosition() position},
 * if there is one, and reports the charset it identifies.
 *
 * <p>Recognized marks are {@code EF BB BF} (UTF-8), {@code FE FF} (UTF-16BE) and {@code FF FE}
 * (UTF-16LE). When a mark is recognized the position is advanced past it; otherwise the position
 * is left untouched.
 *
 * @return The {@link Charset} indicated by the BOM, or {@code null} if no BOM is present.
 */
@Nullable
public Charset readUtfCharsetFromBom() {
  int remaining = bytesLeft();
  if (remaining < 2) {
    // Every supported BOM is at least two bytes long.
    return null;
  }
  byte first = data[position];
  byte second = data[position + 1];
  // The three-byte UTF-8 mark is tested first, and only when a third byte is available.
  if (remaining >= 3
      && first == (byte) 0xEF
      && second == (byte) 0xBB
      && data[position + 2] == (byte) 0xBF) {
    position += 3;
    return Charsets.UTF_8;
  }
  if (first == (byte) 0xFE && second == (byte) 0xFF) {
    position += 2;
    return Charsets.UTF_16BE;
  }
  if (first == (byte) 0xFF && second == (byte) 0xFE) {
    position += 2;
    return Charsets.UTF_16LE;
  }
  return null;
}
/**
 * Scans forward from {@link #position} for the first code unit that {@link Util#isLinebreak}
 * accepts as a line break, without moving {@link #position}.
 *
 * <p>US-ASCII and UTF-8 data is scanned one byte at a time. UTF-16 data is scanned two bytes at a
 * time, and a code unit only counts as a terminator when its high byte is zero; {@code UTF_16} is
 * scanned with the same byte order as {@code UTF_16BE}.
 *
 * @param charset The charset that determines the code unit size and byte order.
 * @return The index of the first byte of the terminating code unit, or {@link #limit} if no
 *     terminator is found.
 * @throws IllegalArgumentException If {@code charset} is not one of the supported charsets.
 */
private int findNextLineTerminator(Charset charset) {
  // Classify the charset once up front; none of these checks depend on the scan position.
  boolean singleByte = charset.equals(Charsets.UTF_8) || charset.equals(Charsets.US_ASCII);
  boolean bigEndian = charset.equals(Charsets.UTF_16) || charset.equals(Charsets.UTF_16BE);
  boolean littleEndian = charset.equals(Charsets.UTF_16LE);
  if (!singleByte && !bigEndian && !littleEndian) {
    throw new IllegalArgumentException("Unsupported charset: " + charset);
  }

  if (singleByte) {
    for (int index = position; index < limit; index++) {
      if (Util.isLinebreak(data[index])) {
        return index;
      }
    }
    return limit;
  }

  // Two-byte code units: locate the high and low byte within each pair.
  int highByteOffset = bigEndian ? 0 : 1;
  int lowByteOffset = 1 - highByteOffset;
  // Stop before a trailing odd byte, which cannot form a complete code unit.
  for (int index = position; index + 1 < limit; index += 2) {
    if (data[index + highByteOffset] == 0x00 && Util.isLinebreak(data[index + lowByteOffset])) {
      return index;
    }
  }
  return limit;
}
/**
 * Advances {@link #position} past a single line terminator at the current position, if present:
 * a lone '\r', a lone '\n', or a '\r' immediately followed by '\n'.
 *
 * @param charset The charset used to decode the characters at the current position.
 */
private void skipLineTerminator(Charset charset) {
  char consumed = readCharacterIfInList(charset, CR_AND_LF);
  if (consumed != '\r') {
    // Either a '\n' was consumed or there was no terminator; nothing more to skip.
    return;
  }
  // A '\r' was consumed, so also swallow a directly following '\n'.
  readCharacterIfInList(charset, LF);
}
/**
 * Decodes the single code unit at {@link #position} using {@code charset} and consumes it only
 * if it is one of {@code chars}.
 *
 * <p>If the decoded character is contained in {@code chars}, {@link #position} is advanced past
 * it and the character is returned. Otherwise {@link #position} is left unchanged and {@code 0}
 * is returned. {@code 0} is also returned if {@link #bytesLeft()} is too small to hold a whole
 * code unit in {@code charset}, or if {@code charset} is not one of the handled charsets.
 *
 * <p>Only characters that occupy a single code unit are supported (i.e. one byte for UTF-8 and
 * two bytes for UTF-16).
 */
private char readCharacterIfInList(Charset charset, char[] chars) {
  boolean singleByte = charset.equals(Charsets.UTF_8) || charset.equals(Charsets.US_ASCII);
  boolean bigEndian = charset.equals(Charsets.UTF_16) || charset.equals(Charsets.UTF_16BE);
  boolean littleEndian = charset.equals(Charsets.UTF_16LE);
  if (!singleByte && !bigEndian && !littleEndian) {
    return 0;
  }

  int codeUnitSize = singleByte ? 1 : 2;
  if (bytesLeft() < codeUnitSize) {
    return 0;
  }

  char candidate;
  if (singleByte) {
    candidate = Chars.checkedCast(UnsignedBytes.toInt(data[position]));
  } else if (bigEndian) {
    candidate = Chars.fromBytes(data[position], data[position + 1]);
  } else {
    // Little-endian: the second byte of the pair is the high byte.
    candidate = Chars.fromBytes(data[position + 1], data[position]);
  }

  if (!Chars.contains(chars, candidate)) {
    return 0;
  }
  position += codeUnitSize;
  return candidate;
}
} }
...@@ -26,6 +26,8 @@ import com.google.android.exoplayer2.util.Assertions; ...@@ -26,6 +26,8 @@ import com.google.android.exoplayer2.util.Assertions;
import com.google.android.exoplayer2.util.Log; import com.google.android.exoplayer2.util.Log;
import com.google.android.exoplayer2.util.LongArray; import com.google.android.exoplayer2.util.LongArray;
import com.google.android.exoplayer2.util.ParsableByteArray; import com.google.android.exoplayer2.util.ParsableByteArray;
import com.google.common.base.Charsets;
import java.nio.charset.Charset;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
...@@ -74,9 +76,10 @@ public final class SubripDecoder extends SimpleSubtitleDecoder { ...@@ -74,9 +76,10 @@ public final class SubripDecoder extends SimpleSubtitleDecoder {
ArrayList<Cue> cues = new ArrayList<>(); ArrayList<Cue> cues = new ArrayList<>();
LongArray cueTimesUs = new LongArray(); LongArray cueTimesUs = new LongArray();
ParsableByteArray subripData = new ParsableByteArray(data, length); ParsableByteArray subripData = new ParsableByteArray(data, length);
Charset charset = detectUtfCharset(subripData);
@Nullable String currentLine; @Nullable String currentLine;
while ((currentLine = subripData.readLine()) != null) { while ((currentLine = subripData.readLine(charset)) != null) {
if (currentLine.length() == 0) { if (currentLine.length() == 0) {
// Skip blank lines. // Skip blank lines.
continue; continue;
...@@ -91,7 +94,7 @@ public final class SubripDecoder extends SimpleSubtitleDecoder { ...@@ -91,7 +94,7 @@ public final class SubripDecoder extends SimpleSubtitleDecoder {
} }
// Read and parse the timing line. // Read and parse the timing line.
currentLine = subripData.readLine(); currentLine = subripData.readLine(charset);
if (currentLine == null) { if (currentLine == null) {
Log.w(TAG, "Unexpected end"); Log.w(TAG, "Unexpected end");
break; break;
...@@ -109,13 +112,13 @@ public final class SubripDecoder extends SimpleSubtitleDecoder { ...@@ -109,13 +112,13 @@ public final class SubripDecoder extends SimpleSubtitleDecoder {
// Read and parse the text and tags. // Read and parse the text and tags.
textBuilder.setLength(0); textBuilder.setLength(0);
tags.clear(); tags.clear();
currentLine = subripData.readLine(); currentLine = subripData.readLine(charset);
while (!TextUtils.isEmpty(currentLine)) { while (!TextUtils.isEmpty(currentLine)) {
if (textBuilder.length() > 0) { if (textBuilder.length() > 0) {
textBuilder.append("<br>"); textBuilder.append("<br>");
} }
textBuilder.append(processLine(currentLine, tags)); textBuilder.append(processLine(currentLine, tags));
currentLine = subripData.readLine(); currentLine = subripData.readLine(charset);
} }
Spanned text = Html.fromHtml(textBuilder.toString()); Spanned text = Html.fromHtml(textBuilder.toString());
...@@ -139,6 +142,15 @@ public final class SubripDecoder extends SimpleSubtitleDecoder { ...@@ -139,6 +142,15 @@ public final class SubripDecoder extends SimpleSubtitleDecoder {
} }
/**
 * Works out which UTF charset to decode {@code data} with by reading a byte order mark (BOM) at
 * its current position. Falls back to UTF-8 when no BOM is found.
 */
private Charset detectUtfCharset(ParsableByteArray data) {
  @Nullable Charset bomCharset = data.readUtfCharsetFromBom();
  if (bomCharset == null) {
    return Charsets.UTF_8;
  }
  return bomCharset;
}
/**
* Trims and removes tags from the given line. The removed tags are added to {@code tags}. * Trims and removes tags from the given line. The removed tags are added to {@code tags}.
* *
* @param line The line to process. * @param line The line to process.
......
...@@ -26,6 +26,7 @@ import android.text.style.ForegroundColorSpan; ...@@ -26,6 +26,7 @@ import android.text.style.ForegroundColorSpan;
import android.text.style.StyleSpan; import android.text.style.StyleSpan;
import android.text.style.TypefaceSpan; import android.text.style.TypefaceSpan;
import android.text.style.UnderlineSpan; import android.text.style.UnderlineSpan;
import androidx.annotation.Nullable;
import com.google.android.exoplayer2.C; import com.google.android.exoplayer2.C;
import com.google.android.exoplayer2.text.Cue; import com.google.android.exoplayer2.text.Cue;
import com.google.android.exoplayer2.text.SimpleSubtitleDecoder; import com.google.android.exoplayer2.text.SimpleSubtitleDecoder;
...@@ -35,6 +36,7 @@ import com.google.android.exoplayer2.util.Log; ...@@ -35,6 +36,7 @@ import com.google.android.exoplayer2.util.Log;
import com.google.android.exoplayer2.util.ParsableByteArray; import com.google.android.exoplayer2.util.ParsableByteArray;
import com.google.android.exoplayer2.util.Util; import com.google.android.exoplayer2.util.Util;
import com.google.common.base.Charsets; import com.google.common.base.Charsets;
import java.nio.charset.Charset;
import java.util.List; import java.util.List;
/** /**
...@@ -46,16 +48,12 @@ public final class Tx3gDecoder extends SimpleSubtitleDecoder { ...@@ -46,16 +48,12 @@ public final class Tx3gDecoder extends SimpleSubtitleDecoder {
private static final String TAG = "Tx3gDecoder"; private static final String TAG = "Tx3gDecoder";
private static final char BOM_UTF16_BE = '\uFEFF';
private static final char BOM_UTF16_LE = '\uFFFE';
private static final int TYPE_STYL = 0x7374796c; private static final int TYPE_STYL = 0x7374796c;
private static final int TYPE_TBOX = 0x74626f78; private static final int TYPE_TBOX = 0x74626f78;
private static final String TX3G_SERIF = "Serif"; private static final String TX3G_SERIF = "Serif";
private static final int SIZE_ATOM_HEADER = 8; private static final int SIZE_ATOM_HEADER = 8;
private static final int SIZE_SHORT = 2; private static final int SIZE_SHORT = 2;
private static final int SIZE_BOM_UTF16 = 2;
private static final int SIZE_STYLE_RECORD = 12; private static final int SIZE_STYLE_RECORD = 12;
private static final int FONT_FACE_BOLD = 0x0001; private static final int FONT_FACE_BOLD = 0x0001;
...@@ -171,13 +169,11 @@ public final class Tx3gDecoder extends SimpleSubtitleDecoder { ...@@ -171,13 +169,11 @@ public final class Tx3gDecoder extends SimpleSubtitleDecoder {
if (textLength == 0) { if (textLength == 0) {
return ""; return "";
} }
if (parsableByteArray.bytesLeft() >= SIZE_BOM_UTF16) { int textStartPosition = parsableByteArray.getPosition();
char firstChar = parsableByteArray.peekChar(); @Nullable Charset charset = parsableByteArray.readUtfCharsetFromBom();
if (firstChar == BOM_UTF16_BE || firstChar == BOM_UTF16_LE) { int bomSize = parsableByteArray.getPosition() - textStartPosition;
return parsableByteArray.readString(textLength, Charsets.UTF_16); return parsableByteArray.readString(
} textLength - bomSize, charset != null ? charset : Charsets.UTF_8);
}
return parsableByteArray.readString(textLength, Charsets.UTF_8);
} }
private void applyStyleRecord(ParsableByteArray parsableByteArray, SpannableStringBuilder cueText) private void applyStyleRecord(ParsableByteArray parsableByteArray, SpannableStringBuilder cueText)
......
...@@ -40,6 +40,8 @@ public final class SubripDecoderTest { ...@@ -40,6 +40,8 @@ public final class SubripDecoderTest {
private static final String TYPICAL_NEGATIVE_TIMESTAMPS = private static final String TYPICAL_NEGATIVE_TIMESTAMPS =
"media/subrip/typical_negative_timestamps"; "media/subrip/typical_negative_timestamps";
private static final String TYPICAL_UNEXPECTED_END = "media/subrip/typical_unexpected_end"; private static final String TYPICAL_UNEXPECTED_END = "media/subrip/typical_unexpected_end";
private static final String TYPICAL_UTF16BE = "media/subrip/typical_utf16be";
private static final String TYPICAL_UTF16LE = "media/subrip/typical_utf16le";
private static final String TYPICAL_WITH_TAGS = "media/subrip/typical_with_tags"; private static final String TYPICAL_WITH_TAGS = "media/subrip/typical_with_tags";
private static final String TYPICAL_NO_HOURS_AND_MILLIS = private static final String TYPICAL_NO_HOURS_AND_MILLIS =
"media/subrip/typical_no_hours_and_millis"; "media/subrip/typical_no_hours_and_millis";
...@@ -149,6 +151,32 @@ public final class SubripDecoderTest { ...@@ -149,6 +151,32 @@ public final class SubripDecoderTest {
} }
@Test @Test
public void decodeTypicalUtf16LittleEndian() throws IOException {
  SubripDecoder subripDecoder = new SubripDecoder();
  // Load the TYPICAL_UTF16LE asset and decode the whole buffer.
  byte[] subripBytes =
      TestUtil.getByteArray(ApplicationProvider.getApplicationContext(), TYPICAL_UTF16LE);
  Subtitle decoded = subripDecoder.decode(subripBytes, subripBytes.length, false);

  // Expect six event times, with the three typical cues at event indices 0, 2 and 4.
  assertThat(decoded.getEventTimeCount()).isEqualTo(6);
  assertTypicalCue1(decoded, 0);
  assertTypicalCue2(decoded, 2);
  assertTypicalCue3(decoded, 4);
}
@Test
public void decodeTypicalUtf16BigEndian() throws IOException {
  SubripDecoder subripDecoder = new SubripDecoder();
  // Load the TYPICAL_UTF16BE asset and decode the whole buffer.
  byte[] subripBytes =
      TestUtil.getByteArray(ApplicationProvider.getApplicationContext(), TYPICAL_UTF16BE);
  Subtitle decoded = subripDecoder.decode(subripBytes, subripBytes.length, false);

  // Expect six event times, with the three typical cues at event indices 0, 2 and 4.
  assertThat(decoded.getEventTimeCount()).isEqualTo(6);
  assertTypicalCue1(decoded, 0);
  assertTypicalCue2(decoded, 2);
  assertTypicalCue3(decoded, 4);
}
@Test
public void decodeCueWithTag() throws IOException { public void decodeCueWithTag() throws IOException {
SubripDecoder decoder = new SubripDecoder(); SubripDecoder decoder = new SubripDecoder();
byte[] bytes = byte[] bytes =
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment