Commit 90e9807a by Lev

Fixes after review

parent 5609efd0
...@@ -27,6 +27,10 @@ import java.util.Arrays; ...@@ -27,6 +27,10 @@ import java.util.Arrays;
*/ */
public final class ParsableByteArray { public final class ParsableByteArray {
// UTF-16 byte order marks (BOM), expressed as the char obtained when the two leading bytes of
// the data are combined with the first byte as the high-order byte.
// NOTE(review): this matches how peekChar() appears to build its result - confirm.
/** Byte sequence 0xFE 0xFF: marks the data as UTF-16 big-endian. */
public static final char BOM_UTF16_BE = '\uFEFF';
/** Byte sequence 0xFF 0xFE: marks the data as UTF-16 little-endian. */
public static final char BOM_UTF16_LE = '\uFFFE';
private byte[] data; private byte[] data;
private int position; private int position;
// TODO(internal b/147657250): Enforce this limit on all read methods. // TODO(internal b/147657250): Enforce this limit on all read methods.
...@@ -153,6 +157,11 @@ public final class ParsableByteArray { ...@@ -153,6 +157,11 @@ public final class ParsableByteArray {
this.position = position; this.position = position;
} }
/**
 * Moves the current byte offset back to the start of the array.
 *
 * <p>After this call the position is {@code 0}.
 */
public void resetPosition() {
  position = 0;
}
/** /**
* Returns the underlying array. * Returns the underlying array.
* *
...@@ -228,6 +237,11 @@ public final class ParsableByteArray { ...@@ -228,6 +237,11 @@ public final class ParsableByteArray {
return (char) ((data[position] & 0xFF) << 8 | (data[position + 1] & 0xFF)); return (char) ((data[position] & 0xFF) << 8 | (data[position + 1] & 0xFF));
} }
/**
 * Peeks at the next two bytes, interpreted as a little-endian {@code char}, without advancing
 * the current position.
 *
 * @return The char whose low-order byte is at the current position and whose high-order byte
 *     is at the following position.
 */
public char peekLittleEndianChar() {
  // Mask each byte to its unsigned value before combining, so sign extension cannot leak
  // into the other half of the char.
  int lowByte = data[position] & 0xFF;
  int highByte = data[position + 1] & 0xFF;
  return (char) (lowByte | (highByte << 8));
}
/** Reads the next byte as an unsigned value. */ /** Reads the next byte as an unsigned value. */
public int readUnsignedByte() { public int readUnsignedByte() {
return (data[position++] & 0xFF); return (data[position++] & 0xFF);
...@@ -532,48 +546,67 @@ public final class ParsableByteArray { ...@@ -532,48 +546,67 @@ public final class ParsableByteArray {
} }
/** /**
* Reads a line of text. * Reads a line of text. Only UTF-8, UTF-16LE, UTF-16BE encoding supported.
* *
* <p>A line is considered to be terminated by any one of a carriage return ('\r'), a line feed * <p>A line is considered to be terminated by any one of a carriage return ('\r'), a line feed
* ('\n'), or a carriage return followed immediately by a line feed ('\r\n'). The UTF-16 charset * ('\n'), or a carriage return followed immediately by a line feed ('\r\n'). The UTF-16 charset
* is used. This method discards leading UTF-16 byte order marks (BOM), if present. * is used. This method discards leading UTF-16 byte order marks (BOM), if present.
* *
* @param isLittleEndian UTF-16 (LE) or UTF-16 (BE) encoding should be used * @param charset used encoding.
* @return The line not including any line-termination characters, or null if the end of the data * @return The line not including any line-termination characters, or null if the end of the data
* has already been reached. * has already been reached.
* @throws IllegalArgumentException if the charset is not supported.
*/ */
@Nullable @Nullable
public String readLineUtf16(boolean isLittleEndian) { public String readUtfLine(Charset charset) {
if(!charset.equals(Charsets.UTF_8)
&& !charset.equals(Charsets.UTF_16BE)
&& !charset.equals(Charsets.UTF_16LE)) {
throw new IllegalArgumentException("Only UTF-8, UTF-16LE, UTF-16BE encoding supported.");
}
if(charset.equals(Charsets.UTF_8)) {
return readLine();
}
if (bytesLeft() == 0) { if (bytesLeft() == 0) {
return null; return null;
} }
boolean isLittleEndian = charset.equals(Charsets.UTF_16LE);
int lineLimit = calculateLineLimitForUtf16(isLittleEndian); int lineLimit = calculateLineLimitForUtf16(isLittleEndian);
if (lineLimit - position >= 2 && isUtf16BOM(data[position], data[position + 1])) { if (lineLimit - position >= 2 && isUtf16BOM(peekChar())) {
// There's a UTF-16 byte order mark at the start of the line. Discard it. // There's a UTF-16 byte order mark at the start of the line. Discard it.
position += 2; position += 2;
} }
String line; String line = readString(lineLimit - position, charset);
if (isLittleEndian) {
line = Util.fromUtf16LEBytes(data, position, lineLimit - position);
} else {
line = Util.fromUtf16BEBytes(data, position, lineLimit - position);
}
position = lineLimit;
if (position == limit) { if (position == limit) {
return line; return line;
} }
if (isEqualsInUtf16(data[position], data[position + 1], '\r', isLittleEndian)) { char currentChar;
if(isLittleEndian) {
currentChar = peekLittleEndianChar();
} else {
currentChar = peekChar();
}
if (currentChar == '\r') {
position += 2; position += 2;
if (position == limit) { if (position == limit) {
return line; return line;
} }
} }
if (isEqualsInUtf16(data[position], data[position + 1], '\n', isLittleEndian)) {
if(isLittleEndian) {
currentChar = peekLittleEndianChar();
} else {
currentChar = peekChar();
}
if (currentChar == '\n') {
position += 2; position += 2;
} }
return line; return line;
...@@ -614,14 +647,8 @@ public final class ParsableByteArray { ...@@ -614,14 +647,8 @@ public final class ParsableByteArray {
return value; return value;
} }
private boolean isEqualsInUtf16(byte first, byte second, char value, boolean isLittleEndian) { private boolean isUtf16BOM(char character) {
return (isLittleEndian && (first | second << 8) == value) return character == BOM_UTF16_BE || character == BOM_UTF16_LE;
|| (!isLittleEndian && (first << 8 | second) == value);
}
private boolean isUtf16BOM(byte first, byte second) {
return (first == (byte) 0xFF && second == (byte) 0xFE)
|| (first == (byte) 0xFE && second == (byte) 0xFF);
} }
private int calculateLineLimitForUtf16(boolean isLittleEndian) { private int calculateLineLimitForUtf16(boolean isLittleEndian) {
......
...@@ -683,30 +683,6 @@ public final class Util { ...@@ -683,30 +683,6 @@ public final class Util {
} }
/**
 * Decodes a region of a byte array as UTF-16 (LE) text.
 *
 * @param bytes The array holding the UTF-16 (LE) encoded bytes.
 * @param offset The index in {@code bytes} at which decoding starts.
 * @param length How many bytes to decode.
 * @return A new {@link String} containing the decoded text.
 */
public static String fromUtf16LEBytes(byte[] bytes, int offset, int length) {
  final String decoded = new String(bytes, offset, length, Charsets.UTF_16LE);
  return decoded;
}
/**
 * Decodes a region of a byte array as UTF-16 (BE) text.
 *
 * @param bytes The array holding the UTF-16 (BE) encoded bytes.
 * @param offset The index in {@code bytes} at which decoding starts.
 * @param length How many bytes to decode.
 * @return A new {@link String} containing the decoded text.
 */
public static String fromUtf16BEBytes(byte[] bytes, int offset, int length) {
  final String decoded = new String(bytes, offset, length, Charsets.UTF_16BE);
  return decoded;
}
/**
* Returns a new byte array containing the code points of a {@link String} encoded using UTF-8. * Returns a new byte array containing the code points of a {@link String} encoded using UTF-8.
* *
* @param value The {@link String} whose bytes should be obtained. * @param value The {@link String} whose bytes should be obtained.
......
...@@ -72,30 +72,14 @@ public final class SubripDecoder extends SimpleSubtitleDecoder { ...@@ -72,30 +72,14 @@ public final class SubripDecoder extends SimpleSubtitleDecoder {
} }
@Override @Override
protected Subtitle decode(byte[] bytes, int length, boolean reset) { protected Subtitle decode(byte[] data, int length, boolean reset) {
ArrayList<Cue> cues = new ArrayList<>(); ArrayList<Cue> cues = new ArrayList<>();
LongArray cueTimesUs = new LongArray(); LongArray cueTimesUs = new LongArray();
ParsableByteArray subripData = new ParsableByteArray(bytes, length); ParsableByteArray subripData = new ParsableByteArray(data, length);
Charset charset = detectUtfCharset(subripData);
@Nullable Charset utf16Charset;
if (bytes.length >= 2) {
utf16Charset = getUtf16Charset(bytes[0], bytes[1]);
} else {
utf16Charset = null;
}
@Nullable String currentLine; @Nullable String currentLine;
while (true) { while ((currentLine = subripData.readUtfLine(charset)) != null) {
if (utf16Charset != null) {
currentLine = subripData.readLineUtf16(utf16Charset.equals(Charsets.UTF_16LE));
} else {
currentLine = subripData.readLine();
}
if (currentLine == null) {
break;
}
if (currentLine.length() == 0) { if (currentLine.length() == 0) {
// Skip blank lines. // Skip blank lines.
continue; continue;
...@@ -110,11 +94,7 @@ public final class SubripDecoder extends SimpleSubtitleDecoder { ...@@ -110,11 +94,7 @@ public final class SubripDecoder extends SimpleSubtitleDecoder {
} }
// Read and parse the timing line. // Read and parse the timing line.
if (utf16Charset != null) { currentLine = subripData.readUtfLine(charset);
currentLine = subripData.readLineUtf16(utf16Charset.equals(Charsets.UTF_16LE));
} else {
currentLine = subripData.readLine();
}
if (currentLine == null) { if (currentLine == null) {
Log.w(TAG, "Unexpected end"); Log.w(TAG, "Unexpected end");
break; break;
...@@ -132,21 +112,13 @@ public final class SubripDecoder extends SimpleSubtitleDecoder { ...@@ -132,21 +112,13 @@ public final class SubripDecoder extends SimpleSubtitleDecoder {
// Read and parse the text and tags. // Read and parse the text and tags.
textBuilder.setLength(0); textBuilder.setLength(0);
tags.clear(); tags.clear();
if (utf16Charset != null) { currentLine = subripData.readUtfLine(charset);
currentLine = subripData.readLineUtf16(utf16Charset.equals(Charsets.UTF_16LE));
} else {
currentLine = subripData.readLine();
}
while (!TextUtils.isEmpty(currentLine)) { while (!TextUtils.isEmpty(currentLine)) {
if (textBuilder.length() > 0) { if (textBuilder.length() > 0) {
textBuilder.append("<br>"); textBuilder.append("<br>");
} }
textBuilder.append(processLine(currentLine, tags)); textBuilder.append(processLine(currentLine, tags));
if (utf16Charset != null) { currentLine = subripData.readUtfLine(charset);
currentLine = subripData.readLineUtf16(utf16Charset.equals(Charsets.UTF_16LE));
} else {
currentLine = subripData.readLine();
}
} }
Spanned text = Html.fromHtml(textBuilder.toString()); Spanned text = Html.fromHtml(textBuilder.toString());
...@@ -169,19 +141,29 @@ public final class SubripDecoder extends SimpleSubtitleDecoder { ...@@ -169,19 +141,29 @@ public final class SubripDecoder extends SimpleSubtitleDecoder {
return new SubripSubtitle(cuesArray, cueTimesUsArray); return new SubripSubtitle(cuesArray, cueTimesUsArray);
} }
@Nullable /**
private Charset getUtf16Charset(byte first, byte second) { * Determine UTF encoding of the byte array. It can be UTF-16LE/UTF-16BE
if (first == (byte) 0xFE && second == (byte) 0xFF) { * if the byte array contains BOM, or UTF-8 otherwise as the default behavior.
// UTF-16 (BE) * After it resets the offset in ParsableByteArray
return Charsets.UTF_16BE; *
* @param data byte array whose UTF encoding should be determined.
* @return Determined encoding
*/
private Charset detectUtfCharset(ParsableByteArray data) {
if(data.limit() < 2) {
return Charsets.UTF_8;
} }
if (first == (byte) 0xFF && second == (byte) 0xFE) { char twoBytes = data.peekChar();
// UTF-16 (LE)
switch (twoBytes) {
case ParsableByteArray.BOM_UTF16_BE:
return Charsets.UTF_16BE;
case ParsableByteArray.BOM_UTF16_LE:
return Charsets.UTF_16LE; return Charsets.UTF_16LE;
default:
return Charsets.UTF_8;
} }
return null;
} }
/** /**
......
...@@ -46,9 +46,6 @@ public final class Tx3gDecoder extends SimpleSubtitleDecoder { ...@@ -46,9 +46,6 @@ public final class Tx3gDecoder extends SimpleSubtitleDecoder {
private static final String TAG = "Tx3gDecoder"; private static final String TAG = "Tx3gDecoder";
private static final char BOM_UTF16_BE = '\uFEFF';
private static final char BOM_UTF16_LE = '\uFFFE';
private static final int TYPE_STYL = 0x7374796c; private static final int TYPE_STYL = 0x7374796c;
private static final int TYPE_TBOX = 0x74626f78; private static final int TYPE_TBOX = 0x74626f78;
private static final String TX3G_SERIF = "Serif"; private static final String TX3G_SERIF = "Serif";
...@@ -173,7 +170,7 @@ public final class Tx3gDecoder extends SimpleSubtitleDecoder { ...@@ -173,7 +170,7 @@ public final class Tx3gDecoder extends SimpleSubtitleDecoder {
} }
if (parsableByteArray.bytesLeft() >= SIZE_BOM_UTF16) { if (parsableByteArray.bytesLeft() >= SIZE_BOM_UTF16) {
char firstChar = parsableByteArray.peekChar(); char firstChar = parsableByteArray.peekChar();
if (firstChar == BOM_UTF16_BE || firstChar == BOM_UTF16_LE) { if (firstChar == ParsableByteArray.BOM_UTF16_BE || firstChar == ParsableByteArray.BOM_UTF16_LE) {
return parsableByteArray.readString(textLength, Charsets.UTF_16); return parsableByteArray.readString(textLength, Charsets.UTF_16);
} }
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment