package com.java110.code.util;
|
|
/**
 * version: 1.1 / 2007-01-25
 *  - changed BOM recognition ordering (longer BOMs first)
 *
 * Original pseudocode   : Thomas Weidenfeller
 * Implementation tweaked: Aki Nieminen
 *
 * http://www.unicode.org/unicode/faq/utf_bom.html
 *
 * BOMs:
 *   00 00 FE FF = UTF-32, big-endian
 *   FF FE 00 00 = UTF-32, little-endian
 *   EF BB BF    = UTF-8
 *   FE FF       = UTF-16, big-endian
 *   FF FE       = UTF-16, little-endian
 *
 * Win2k Notepad:
 *   Unicode format = UTF-16LE
 */
|
|
|
import java.io.*;
|
|
|
/**
 * Generic Unicode text reader that uses a leading byte-order mark (BOM)
 * to pick the character encoding. If the stream does not start with a
 * recognised BOM, the caller-supplied default encoding is used; if that is
 * {@code null}, the platform default encoding is used.
 *
 * <p>The BOM itself is consumed and never returned to the caller. Detection
 * is lazy: it happens on the first call to {@link #read(char[], int, int)}
 * (or an explicit {@link #init()}).
 *
 * <p>Note that {@code FF FE 00 00} is treated as a UTF-32LE BOM, although the
 * same bytes could also be a UTF-16LE BOM followed by U+0000; longer BOMs
 * are deliberately checked first.
 */
public class UnicodeReader extends Reader {

    /** Length of the longest recognised BOM (UTF-32); also the push-back capacity. */
    private static final int BOM_SIZE = 4;

    /** Raw byte source, wrapped so that non-BOM look-ahead bytes can be pushed back. */
    PushbackInputStream internalIn;

    /** Decoding reader; stays {@code null} until {@link #init()} has run. */
    InputStreamReader internalIn2 = null;

    /** Encoding used when no BOM is present; {@code null} means platform default. */
    String defaultEnc;

    /**
     * Creates a reader over the given byte stream. No bytes are read until
     * the first read (or an explicit {@link #init()}).
     *
     * @param in         input stream to be read
     * @param defaultEnc encoding to use if the stream has no BOM;
     *                   pass {@code null} to use the system-level default
     */
    public UnicodeReader(InputStream in, String defaultEnc) {
        internalIn = new PushbackInputStream(in, BOM_SIZE);
        this.defaultEnc = defaultEnc;
    }

    /**
     * Returns the fallback encoding given at construction time.
     *
     * @return the default encoding name, or {@code null} if none was given
     */
    public String getDefaultEncoding() {
        return defaultEnc;
    }

    /**
     * Returns the encoding actually in use, as reported by the underlying
     * {@link InputStreamReader}.
     *
     * @return the encoding name, or {@code null} if the stream has not been
     *         initialised yet (call {@link #init()} or read first)
     */
    public String getEncoding() {
        if (internalIn2 == null) {
            return null;
        }
        return internalIn2.getEncoding();
    }

    /**
     * Reads ahead up to four bytes and checks them for a BOM. Only the BOM
     * bytes are consumed; any other look-ahead bytes are pushed back onto the
     * stream. Calling this more than once has no further effect.
     *
     * @throws IOException if reading the stream fails or the selected
     *                     encoding is not supported
     */
    protected void init() throws IOException {
        if (internalIn2 != null) {
            return;
        }

        // A single read() may legally return fewer bytes than requested, so
        // keep reading until the look-ahead buffer is full or EOF is hit.
        byte[] bom = new byte[BOM_SIZE];
        int n = 0;
        while (n < bom.length) {
            int got = internalIn.read(bom, n, bom.length - n);
            if (got < 0) {
                break;
            }
            n += got;
        }

        // Longer BOMs first: the UTF-32LE mark begins with the UTF-16LE mark.
        // Each check is bounded by n so zero padding is never taken for data.
        String encoding;
        int bomLength;
        if (startsWith(bom, n, 0x00, 0x00, 0xFE, 0xFF)) {
            encoding = "UTF-32BE";
            bomLength = 4;
        } else if (startsWith(bom, n, 0xFF, 0xFE, 0x00, 0x00)) {
            encoding = "UTF-32LE";
            bomLength = 4;
        } else if (startsWith(bom, n, 0xEF, 0xBB, 0xBF)) {
            encoding = "UTF-8";
            bomLength = 3;
        } else if (startsWith(bom, n, 0xFE, 0xFF)) {
            encoding = "UTF-16BE";
            bomLength = 2;
        } else if (startsWith(bom, n, 0xFF, 0xFE)) {
            encoding = "UTF-16LE";
            bomLength = 2;
        } else {
            // No BOM found: everything read so far is payload.
            encoding = defaultEnc;
            bomLength = 0;
        }

        // Give back whatever was read beyond the BOM.
        int unread = n - bomLength;
        if (unread > 0) {
            internalIn.unread(bom, bomLength, unread);
        }

        if (encoding == null) {
            internalIn2 = new InputStreamReader(internalIn);
        } else {
            internalIn2 = new InputStreamReader(internalIn, encoding);
        }
    }

    /**
     * Closes the reader and the underlying stream. If the reader was never
     * initialised, the raw stream is closed directly without reading from it,
     * so closing cannot block on I/O or fail because of an unsupported
     * default encoding.
     *
     * @throws IOException if closing the underlying stream fails
     */
    @Override
    public void close() throws IOException {
        if (internalIn2 != null) {
            internalIn2.close();
        } else {
            internalIn.close();
        }
    }

    /**
     * Reads decoded characters into a portion of an array, detecting the
     * encoding first if that has not happened yet.
     *
     * @param cbuf destination buffer
     * @param off  offset at which to start storing characters
     * @param len  maximum number of characters to read
     * @return the number of characters read, or -1 at end of stream
     * @throws IOException if an I/O error occurs
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        init();
        return internalIn2.read(cbuf, off, len);
    }

    /** Tells whether the first {@code available} bytes of {@code buf} begin with {@code marker}. */
    private static boolean startsWith(byte[] buf, int available, int... marker) {
        if (available < marker.length) {
            return false;
        }
        for (int i = 0; i < marker.length; i++) {
            if (buf[i] != (byte) marker[i]) {
                return false;
            }
        }
        return true;
    }
}
|