Android--判断文本文件编码

原创

wx59bdec579ef96 2022-11-08 10:59:02 博主文章分类：Android ©著作权

©著作权归作者所有：来自51CTO博客作者wx59bdec579ef96的原创作品，请联系作者获取转载授权，否则将追究法律责任

方法1：利用windows文本文件编码特点。

Windows下，以Unicode（即UTF-16LE）、Unicode big endian（即UTF-16BE）和UTF-8编码保存的txt文件，开头会多出几个字节的BOM（字节顺序标记），分别是FF FE（Unicode）、FE FF（Unicode big endian）、EF BB BF（UTF-8）。

/**
 * Guesses the charset of a text file from its leading bytes.
 *
 * <p>The file is first checked for a byte-order mark: {@code FF FE} yields
 * {@code "UTF-16LE"}, {@code FE FF} yields {@code "UTF-16BE"} and {@code EF BB BF}
 * yields {@code "UTF-8"}. Without a BOM the bytes are scanned from the start of the
 * file: the first complete three-byte sequence (lead byte {@code E0-EF} followed by two
 * bytes in {@code 80-BF}) makes the result {@code "UTF-8"}; two-byte sequences
 * ({@code C0-DF} followed by {@code 80-BF}) are skipped because they may also occur in
 * GB-encoded text; any other non-ASCII byte stops the scan and the default is kept.
 *
 * <p>This is a heuristic: a file containing only ASCII bytes, an empty file, a
 * {@code null} argument or an unreadable file all produce the default {@code "GBK"}.
 *
 * @param file the file to inspect
 * @return {@code "UTF-16LE"}, {@code "UTF-16BE"}, {@code "UTF-8"} or the default
 *         {@code "GBK"}
 */
public static String getCharset(File file) {
    String charset = "GBK";
    if (file == null) {
        return charset;
    }
    // try-with-resources closes the stream on every path, including early returns.
    try (BufferedInputStream bis = new BufferedInputStream(new FileInputStream(file))) {
        byte[] first3Bytes = new byte[3];
        // The read limit must cover the BOM probe so that reset() below is guaranteed to work.
        bis.mark(first3Bytes.length);
        int read = bis.read(first3Bytes, 0, first3Bytes.length);
        if (read == -1) {
            return charset; // empty file
        }
        // Only compare as many bytes as were actually read.
        if (read >= 2 && first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) {
            return "UTF-16LE";
        }
        if (read >= 2 && first3Bytes[0] == (byte) 0xFE && first3Bytes[1] == (byte) 0xFF) {
            return "UTF-16BE";
        }
        if (read >= 3
                && first3Bytes[0] == (byte) 0xEF
                && first3Bytes[1] == (byte) 0xBB
                && first3Bytes[2] == (byte) 0xBF) {
            return "UTF-8";
        }

        // No BOM: rewind and look for UTF-8 multi-byte patterns.
        bis.reset();
        while ((read = bis.read()) != -1) {
            if (read < 0x80) {
                continue; // plain ASCII tells us nothing, keep scanning
            }
            if (read <= 0xBF || read >= 0xF0) {
                // A lone continuation byte (80-BF) or a byte >= F0 is treated as GBK.
                break;
            }
            if (read <= 0xDF) {
                // Two-byte form (C0-DF)(80-BF): also possible in GB encodings, so it is
                // not conclusive; keep scanning when well-formed, give up otherwise.
                int second = bis.read();
                if (0x80 <= second && second <= 0xBF) {
                    continue;
                }
                break;
            }
            // Three-byte form (E0-EF)(80-BF)(80-BF): may still misfire, but rarely.
            int second = bis.read();
            if (0x80 <= second && second <= 0xBF) {
                int third = bis.read();
                if (0x80 <= third && third <= 0xBF) {
                    charset = "UTF-8";
                }
            }
            break;
        }
    } catch (java.io.IOException | SecurityException e) {
        // Best effort: report the problem and fall back to the default charset.
        e.printStackTrace();
    }
    return charset;
}

缺点：Linux下的文本文件通常不带BOM，此时只能依靠后面的字节特征去猜测，因此不能用这种方式可靠地探测Linux下的文件。

方法2：开源工程JCharDet

http://www.iteye.com/topic/266501

package org.mozilla.intl.chardet;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;

/**
 * 借助JCharDet获取文件字符集
 * @author icer
 * PS:
 * JCharDet 是mozilla自动字符集探测算法代码的java移植，其官方主页为：
 *      http://jchardet.sourceforge.net/
 * @date  2008/11/13 
 */
public class FileCharsetDetector {

  private boolean found = false;

  /**
   * 如果完全匹配某个字符集检测算法, 则该属性保存该字符集的名称. 否则(如二进制文件)其值就为默认值 null, 这时应当查询属性 
   */
  private String encoding = null;

  public static void main(String[] argv) throws Exception {
    if (argv.length != 1 && argv.length != 2) {

      System.out
          .println("Usage: FileCharsetDetector <path> [<languageHint>]");

      System.out.println("");
      System.out.println("Where <path> is d:/demo.txt");
      System.out.println("For optional <languageHint>. Use following...");
      System.out.println("    1 => Japanese");
      System.out.println("    2 => Chinese");
      System.out.println("    3 => Simplified Chinese");
      System.out.println("    4 => Traditional Chinese");
      System.out.println("    5 => Korean");
      System.out.println("    6 => Dont know (default)");

      return;
    } else {
      String encoding = null;
      if (argv.length == 2) {
        encoding = new FileCharsetDetector().guestFileEncoding(argv[0],
            Integer.valueOf(argv[1]));
      } else {
        encoding = new FileCharsetDetector().guestFileEncoding(argv[0]);
      }
      System.out.println("文件编码:" + encoding);
    }
  }

  /**
   * 传入一个文件(File)对象，检查文件编码
   * 
   * @param file
   *            File对象实例
   * @return 文件编码，若无，则返回null
   * @throws FileNotFoundException
   * @throws IOException
   */
  public String guestFileEncoding(File file) throws FileNotFoundException,
      IOException {
    return geestFileEncoding(file, new nsDetector());
  }

  /**
   * 获取文件的编码
   * 
   * @param file
   *            File对象实例
   * @param languageHint
   *            语言提示区域代码 eg：1 : Japanese; 2 : Chinese; 3 : Simplified Chinese;
   *            4 : Traditional Chinese; 5 : Korean; 6 : Dont know (default)
   * @return 文件编码，eg：UTF-8,GBK,GB2312形式，若无，则返回null
   * @throws FileNotFoundException
   * @throws IOException
   */
  public String guestFileEncoding(File file, int languageHint)
      throws FileNotFoundException, IOException {
    return geestFileEncoding(file, new nsDetector(languageHint));
  }

  /**
   * 获取文件的编码
   * 
   * @param path
   *            文件路径
   * @return 文件编码，eg：UTF-8,GBK,GB2312形式，若无，则返回null
   * @throws FileNotFoundException
   * @throws IOException
   */
  public String guestFileEncoding(String path) throws FileNotFoundException,
      IOException {
    return guestFileEncoding(new File(path));
  }

  /**
   * 获取文件的编码
   * 
   * @param path
   *            文件路径
   * @param languageHint
   *            语言提示区域代码 eg：1 : Japanese; 2 : Chinese; 3 : Simplified Chinese;
   *            4 : Traditional Chinese; 5 : Korean; 6 : Dont know (default)
   * @return
   * @throws FileNotFoundException
   * @throws IOException
   */
  public String guestFileEncoding(String path, int languageHint)
      throws FileNotFoundException, IOException {
    return guestFileEncoding(new File(path), languageHint);
  }

  /**
   * 获取文件的编码
   * 
   * @param file
   * @param det
   * @return
   * @throws FileNotFoundException
   * @throws IOException
   */
  private String geestFileEncoding(File file, nsDetector det)
      throws FileNotFoundException, IOException {
    // Set an observer...
    // The Notify() will be called when a matching charset is found.
    det.Init(new nsICharsetDetectionObserver() {
      public void Notify(String charset) {
        found = true;
        encoding = charset;
      }
    });

    BufferedInputStream imp = new BufferedInputStream(new FileInputStream(
        file));

    byte[] buf = new byte[1024];
    int len;
    boolean done = false;
    boolean isAscii = true;

    while ((len = imp.read(buf, 0, buf.length)) != -1) {
      // Check if the stream is only ascii.
      if (isAscii)
        isAscii = det.isAscii(buf, len);

      // DoIt if non-ascii and not done yet.
      if (!isAscii && !done)
        done = det.DoIt(buf, len, false);
    }
    det.DataEnd();

    if (isAscii) {
      encoding = "ASCII";
      found = true;
    }

    if (!found) {
      String prob[] = det.getProbableCharsets();
      if (prob.length > 0) {
        // 在没有发现情况下，则取第一个可能的编码
        encoding = prob[0];
      } else {
        return null;
      }
    }
    return encoding;
  }
}

方法3：开源工程juniversalchardet

http://code.google.com/p/juniversalchardet/

/**
 * Detects the encoding of a file with juniversalchardet's {@code UniversalDetector}.
 *
 * <p>The file is fed to the detector in 4096-byte chunks until the detector reports that
 * it is done or the file ends. The outcome is also printed to standard output.
 *
 * @param file the file to inspect
 * @return the charset name reported by the detector, or {@code null} when the file does
 *         not exist, nothing was detected, or an error occurred while reading
 */
public static String getFileIncode(File file) {

    if (!file.exists()) {
      System.err.println("getFileIncode: file not exists!");
      return null;
    }

    // try-with-resources closes the stream even when reading or detection fails.
    try (FileInputStream fis = new FileInputStream(file)) {
      // (1) create the detector
      UniversalDetector detector = new UniversalDetector(null);

      // (2) feed data until the detector is done; isDone() is checked first so no
      // further chunk is read once a result is available
      byte[] buf = new byte[4096];
      int nread;
      while (!detector.isDone() && (nread = fis.read(buf)) > 0) {
        detector.handleData(buf, 0, nread);
      }
      // (3) signal the end of the data
      detector.dataEnd();

      // (4) fetch the result
      String encoding = detector.getDetectedCharset();
      if (encoding != null) {
        System.out.println("Detected encoding = " + encoding);
      } else {
        System.out.println("No encoding detected.");
      }

      // (5) reset the detector before it is discarded
      detector.reset();
      return encoding;
    } catch (Exception e) {
      // Deliberately broad: detection is best effort and any failure yields null.
      e.printStackTrace();
    }

    return null;
  }