Added UTF-16 (LE) and UTF-16 (BE) support for subrip subtitles.

5609efd0 · Lev · ab4d37f4 · 5609efd0 · 5609efd0 · 5609efd0
Commit 5609efd0 authored Nov 05, 2022 by Lev
Showing with 174 additions and 1 deletion
library/common/src/main/java/com/google/android/exoplayer2/util/ParsableByteArray.java
library/common/src/main/java/com/google/android/exoplayer2/util/Util.java
library/extractor/src/main/java/com/google/android/exoplayer2/text/subrip/SubripDecoder.java
library/extractor/src/test/java/com/google/android/exoplayer2/text/subrip/SubripDecoderTest.java
testdata/src/test/assets/media/subrip/typical_utf16be
testdata/src/test/assets/media/subrip/typical_utf16le
--- a/library/common/src/main/java/com/google/android/exoplayer2/util/ParsableByteArray.java
+++ b/library/common/src/main/java/com/google/android/exoplayer2/util/ParsableByteArray.java
@@ -532,6 +532,54 @@ public final class ParsableByteArray {
  }
  /**
+   * Reads a line of UTF-16 encoded text.
+   *
+   * <p>A line is considered to be terminated by any one of a carriage return ('\r'), a line feed
+   * ('\n'), or a carriage return followed immediately by a line feed ('\r\n'). A UTF-16 byte order
+   * mark (BOM) at the start of the line is discarded, if present.
+   *
+   * <p>If the remaining data has an odd length, the final byte cannot form a UTF-16 code unit. It
+   * is dropped once the text before it has been read, so that the end of the data is always
+   * reached.
+   *
+   * @param isLittleEndian Whether the data is UTF-16 (LE) encoded. If false, UTF-16 (BE) is used.
+   * @return The line not including any line-termination characters, or null if the end of the data
+   *     has already been reached.
+   */
+  @Nullable
+  public String readLineUtf16(boolean isLittleEndian) {
+    if (bytesLeft() == 0) {
+      return null;
+    }
+    int lineLimit = calculateLineLimitForUtf16(isLittleEndian);
+    if (lineLimit - position >= 2 && isUtf16BOM(data[position], data[position + 1])) {
+      // There's a UTF-16 byte order mark at the start of the line. Discard it.
+      position += 2;
+    }
+    String line =
+        isLittleEndian
+            ? Util.fromUtf16LEBytes(data, position, lineLimit - position)
+            : Util.fromUtf16BEBytes(data, position, lineLimit - position);
+    position = lineLimit;
+    if (limit - position < 2) {
+      // Either the end of the data, or a single dangling byte that can never form a code unit.
+      // Consume it: otherwise position would never reach limit, and the terminator checks below
+      // would read data[position + 1] beyond the limit.
+      position = limit;
+      return line;
+    }
+    if (isEqualsInUtf16(data[position], data[position + 1], '\r', isLittleEndian)) {
+      position += 2;
+      if (limit - position < 2) {
+        // Nothing decodable follows the carriage return; drop any dangling byte as above.
+        position = limit;
+        return line;
+      }
+    }
+    if (isEqualsInUtf16(data[position], data[position + 1], '\n', isLittleEndian)) {
+      position += 2;
+    }
+    return line;
+  }
+  /**
   * Reads a long value encoded by UTF-8 encoding
   *
   * @throws NumberFormatException if there is a problem with decoding
@@ -565,4 +613,29 @@ public final class ParsableByteArray {
    position += length;
    return value;
  }
+  /**
+   * Returns whether two bytes, read as a single UTF-16 code unit, equal {@code value}.
+   *
+   * @param first The byte at the lower index.
+   * @param second The byte at the higher index.
+   * @param value The character to compare the code unit against.
+   * @param isLittleEndian Whether {@code first} is the low-order byte of the code unit.
+   * @return Whether the code unit formed by the two bytes equals {@code value}.
+   */
+  private boolean isEqualsInUtf16(byte first, byte second, char value, boolean isLittleEndian) {
+    // Mask each byte to its unsigned value: bytes are sign-extended when promoted to int, so
+    // without the mask any byte >= 0x80 would corrupt the combined code unit.
+    int lowByte = (isLittleEndian ? first : second) & 0xFF;
+    int highByte = (isLittleEndian ? second : first) & 0xFF;
+    return ((highByte << 8) | lowByte) == value;
+  }
+  /**
+   * Returns whether the byte pair is a UTF-16 byte order mark in either byte order, i.e. the
+   * sequence 0xFF 0xFE or the sequence 0xFE 0xFF.
+   */
+  private boolean isUtf16BOM(byte first, byte second) {
+    int pair = ((first & 0xFF) << 8) | (second & 0xFF);
+    return pair == 0xFFFE || pair == 0xFEFF;
+  }
+  /**
+   * Finds where the current UTF-16 line ends, scanning forward from {@code position} one code
+   * unit (two bytes) at a time.
+   *
+   * @param isLittleEndian Whether the lower-indexed byte of each pair is the low-order byte.
+   * @return The index of the first line break code unit found, or the index at which fewer than
+   *     two bytes remain before {@code limit} if there is no line break.
+   */
+  private int calculateLineLimitForUtf16(boolean isLittleEndian) {
+    int lineLimit = position;
+    // Only complete byte pairs are inspected; a trailing odd byte is left for the caller.
+    while (limit - lineLimit >= 2) {
+      // Mask to unsigned before combining, since bytes are sign-extended when promoted to int.
+      int first = data[lineLimit] & 0xFF;
+      int second = data[lineLimit + 1] & 0xFF;
+      int codeUnit = isLittleEndian ? (second << 8) | first : (first << 8) | second;
+      if (Util.isLinebreak(codeUnit)) {
+        break;
+      }
+      lineLimit += 2;
+    }
+    return lineLimit;
+  }
 }
--- a/library/common/src/main/java/com/google/android/exoplayer2/util/Util.java
+++ b/library/common/src/main/java/com/google/android/exoplayer2/util/Util.java
@@ -683,6 +683,30 @@ public final class Util {
  }
  /**
+   * Decodes a region of a byte array as UTF-16 (LE) text.
+   *
+   * @param bytes The array holding the UTF-16 (LE) encoded bytes.
+   * @param offset The index of the first byte to decode.
+   * @param length The number of bytes to decode.
+   * @return A new {@link String} holding the decoded text.
+   */
+  public static String fromUtf16LEBytes(byte[] bytes, int offset, int length) {
+    String decoded = new String(bytes, offset, length, Charsets.UTF_16LE);
+    return decoded;
+  }
+  /**
+   * Decodes a region of a byte array as UTF-16 (BE) text.
+   *
+   * @param bytes The array holding the UTF-16 (BE) encoded bytes.
+   * @param offset The index of the first byte to decode.
+   * @param length The number of bytes to decode.
+   * @return A new {@link String} holding the decoded text.
+   */
+  public static String fromUtf16BEBytes(byte[] bytes, int offset, int length) {
+    String decoded = new String(bytes, offset, length, Charsets.UTF_16BE);
+    return decoded;
+  }
+  /**
   * Returns a new byte array containing the code points of a {@link String} encoded using UTF-8.
   *
   * @param value The {@link String} whose bytes should be obtained.

--- a/library/extractor/src/main/java/com/google/android/exoplayer2/text/subrip/SubripDecoder.java
+++ b/library/extractor/src/main/java/com/google/android/exoplayer2/text/subrip/SubripDecoder.java
@@ -26,6 +26,8 @@ import com.google.android.exoplayer2.util.Assertions;
 import com.google.android.exoplayer2.util.Log;
 import com.google.android.exoplayer2.util.LongArray;
 import com.google.android.exoplayer2.util.ParsableByteArray;
+import com.google.common.base.Charsets;
+import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -75,8 +77,25 @@ public final class SubripDecoder extends SimpleSubtitleDecoder {
    LongArray cueTimesUs = new LongArray();
    ParsableByteArray subripData = new ParsableByteArray(bytes, length);
+    @Nullable Charset utf16Charset;
+    if (bytes.length >= 2) {
+      utf16Charset = getUtf16Charset(bytes[0], bytes[1]);
+    } else {
+      utf16Charset = null;
+    }
    @Nullable String currentLine;
-    while ((currentLine = subripData.readLine()) != null) {
+    while (true) {
+      if (utf16Charset != null) {
+        currentLine = subripData.readLineUtf16(utf16Charset.equals(Charsets.UTF_16LE));
+      } else {
+        currentLine = subripData.readLine();
+      }
+      if (currentLine == null) {
+        break;
+      }
      if (currentLine.length() == 0) {
        // Skip blank lines.
        continue;
@@ -91,7 +110,11 @@ public final class SubripDecoder extends SimpleSubtitleDecoder {
      }
      // Read and parse the timing line.
+      if (utf16Charset != null) {
+        currentLine = subripData.readLineUtf16(utf16Charset.equals(Charsets.UTF_16LE));
+      } else {
        currentLine = subripData.readLine();
+      }
      if (currentLine == null) {
        Log.w(TAG, "Unexpected end");
        break;
@@ -109,14 +132,22 @@ public final class SubripDecoder extends SimpleSubtitleDecoder {
      // Read and parse the text and tags.
      textBuilder.setLength(0);
      tags.clear();
+      if (utf16Charset != null) {
+        currentLine = subripData.readLineUtf16(utf16Charset.equals(Charsets.UTF_16LE));
+      } else {
        currentLine = subripData.readLine();
+      }
      while (!TextUtils.isEmpty(currentLine)) {
        if (textBuilder.length() > 0) {
          textBuilder.append("<br>");
        }
        textBuilder.append(processLine(currentLine, tags));
+        if (utf16Charset != null) {
+          currentLine = subripData.readLineUtf16(utf16Charset.equals(Charsets.UTF_16LE));
+        } else {
          currentLine = subripData.readLine();
        }
+      }
      Spanned text = Html.fromHtml(textBuilder.toString());
@@ -138,6 +169,21 @@ public final class SubripDecoder extends SimpleSubtitleDecoder {
    return new SubripSubtitle(cuesArray, cueTimesUsArray);
  }
+  /**
+   * Maps a leading byte pair to the UTF-16 charset whose byte order mark it is.
+   *
+   * @param first The first byte of the data.
+   * @param second The second byte of the data.
+   * @return {@link Charsets#UTF_16BE} for 0xFE 0xFF, {@link Charsets#UTF_16LE} for 0xFF 0xFE, or
+   *     null if the bytes are not a UTF-16 byte order mark.
+   */
+  @Nullable
+  private Charset getUtf16Charset(byte first, byte second) {
+    int byteOrderMark = ((first & 0xFF) << 8) | (second & 0xFF);
+    switch (byteOrderMark) {
+      case 0xFEFF:
+        // UTF-16 (BE)
+        return Charsets.UTF_16BE;
+      case 0xFFFE:
+        // UTF-16 (LE)
+        return Charsets.UTF_16LE;
+      default:
+        return null;
+    }
+  }
  /**
   * Trims and removes tags from the given line. The removed tags are added to {@code tags}.
   *

--- a/library/extractor/src/test/java/com/google/android/exoplayer2/text/subrip/SubripDecoderTest.java
+++ b/library/extractor/src/test/java/com/google/android/exoplayer2/text/subrip/SubripDecoderTest.java
@@ -40,6 +40,8 @@ public final class SubripDecoderTest {
  private static final String TYPICAL_NEGATIVE_TIMESTAMPS =
      "media/subrip/typical_negative_timestamps";
  private static final String TYPICAL_UNEXPECTED_END = "media/subrip/typical_unexpected_end";
+  private static final String TYPICAL_UTF16BE = "media/subrip/typical_utf16be";
+  private static final String TYPICAL_UTF16LE = "media/subrip/typical_utf16le";
  private static final String TYPICAL_WITH_TAGS = "media/subrip/typical_with_tags";
  private static final String TYPICAL_NO_HOURS_AND_MILLIS =
      "media/subrip/typical_no_hours_and_millis";
@@ -81,6 +83,34 @@ public final class SubripDecoderTest {
  }
  @Test
+  public void decodeTypicalUtf16LE() throws IOException {
+    // Decodes the TYPICAL_UTF16LE asset and expects three cues, i.e. six event times.
+    byte[] utf16LeData =
+        TestUtil.getByteArray(ApplicationProvider.getApplicationContext(), TYPICAL_UTF16LE);
+    Subtitle decoded = new SubripDecoder().decode(utf16LeData, utf16LeData.length, false);
+    assertThat(decoded.getEventTimeCount()).isEqualTo(6);
+    assertTypicalCue1(decoded, 0);
+    assertTypicalCue2(decoded, 2);
+    assertTypicalCue3(decoded, 4);
+  }
+  @Test
+  public void decodeTypicalUtf16BE() throws IOException {
+    // Decodes the TYPICAL_UTF16BE asset and expects three cues, i.e. six event times.
+    byte[] utf16BeData =
+        TestUtil.getByteArray(ApplicationProvider.getApplicationContext(), TYPICAL_UTF16BE);
+    Subtitle decoded = new SubripDecoder().decode(utf16BeData, utf16BeData.length, false);
+    assertThat(decoded.getEventTimeCount()).isEqualTo(6);
+    assertTypicalCue1(decoded, 0);
+    assertTypicalCue2(decoded, 2);
+    assertTypicalCue3(decoded, 4);
+  }
+  @Test
  public void decodeTypicalExtraBlankLine() throws IOException {
    SubripDecoder decoder = new SubripDecoder();
    byte[] bytes =

--- a/testdata/src/test/assets/media/subrip/typical_utf16be
+++ b/testdata/src/test/assets/media/subrip/typical_utf16be
--- a/testdata/src/test/assets/media/subrip/typical_utf16le
+++ b/testdata/src/test/assets/media/subrip/typical_utf16le