集册 Java实例教程 获取文件字符集

获取文件字符集

欢马劈雪     最近更新时间:2020-01-02 10:19:05

515
获取文件字符集

/**

 * baiweigang.cn 

 * Copyright (c) 1984-2014 All Rights Reserved.

 */

//package com.nowjava;

import java.io.BufferedInputStream;
/* Source: 时代Java - NowJava.com */


import java.io.File;

import java.io.FileInputStream;


import java.io.IOException;


public class Main {


    public static String getCharset(String pathName) {

        File file = new File(pathName);

        if (!file.exists()) {

            return "";

        }

        String charset = "GBK";

        byte[] first3Bytes = new byte[3];
        /* Source: NowJava.com */

        BufferedInputStream bis = null;

        try {

            boolean checked = false;

            bis = new BufferedInputStream(new FileInputStream(file));

            bis.mark(0);

            int read = bis.read(first3Bytes, 0, 3);

            if (read == -1)

                return charset;

            if (first3Bytes[0] == (byte) 0xFF

                    && first3Bytes[1] == (byte) 0xFE) {

                charset = "UTF-16LE";

                checked = true;

            } else if (first3Bytes[0] == (byte) 0xFE

                    && first3Bytes[1] == (byte) 0xFF) {

                charset = "UTF-16BE";

                checked = true;

            } else if (first3Bytes[0] == (byte) 0xEF

                    && first3Bytes[1] == (byte) 0xBB

                    && first3Bytes[2] == (byte) 0xBF) {

                charset = "UTF-8";

                checked = true;

            }

            bis.reset();

            if (!checked) {

                //    int len = 0;   

                //                int loc = 0;

                while ((read = bis.read()) != -1) {

                    //                    loc++;

                    if (read >= 0xF0)

                        break;

                    if (0x80 <= read && read <= 0xBF) // BF?GBK   

                        break;

                    if (0xC0 <= read && read <= 0xDF) {

                        read = bis.read();

                        if (0x80 <= read && read <= 0xBF) // ? (0xC0 - 0xDF) (0x80   

                                                          // - 0xBF),GB   

                            continue;

                        else

                            break;

                    } else if (0xE0 <= read && read <= 0xEF) {//    

                        read = bis.read();

                        if (0x80 <= read && read <= 0xBF) {

                            read = bis.read();

                            if (0x80 <= read && read <= 0xBF) {

                                charset = "UTF-8";

                                break;

                            } 
展开阅读全文