集册 Java实例教程 检查文本流并判断文本应使用何种编码类型。

检查文本流并判断文本应使用何种编码类型。

欢马劈雪     最近更新时间:2020-01-02 10:19:05

591
检查文本流,并判断该文本应使用哪种编码类型。

/*

 * Licensed to the Apache Software Foundation (ASF) under one

 * or more contributor license agreements.  See the NOTICE file

 * distributed with this work for additional information

 * regarding copyright ownership.  The ASF licenses this file

 * to you under the Apache License, Version 2.0 (the

 * "License"); you may not use this file except in compliance

 * with the License.  You may obtain a copy of the License at

 *

 *  http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing,

 * software distributed under the License is distributed on an

 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

 * KIND, either express or implied.  See the License for the

 * specific language governing permissions and limitations

 * under the License.

 */

//package com.nowjava;

import java.io.BufferedInputStream;/*NowJava.com 提供*/

import java.io.InputStream;

import java.io.IOException;


public class Main {

    /**
     * Examines a stream of text and decides which content transfer encoding
     * should be used for it.  The preferred answer is "7bit"; "quoted-printable"
     * or "base64" is chosen when the data contains over-long lines or bytes
     * rejected by {@code isAscii(int)}.
     *
     * <p>The stream is read to end-of-file through a local buffer.  It is
     * neither closed nor reset here; that remains the caller's responsibility.
     *
     * @param content     An input stream for the content we're examining.
     *
     * @return "7bit" when no byte is rejected by {@code isAscii(int)} and no line
     *         is longer than 998 bytes; "quoted-printable" when such text has a
     *         longer line, or when rejected bytes do not outnumber accepted ones;
     *         "base64" when rejected bytes outnumber accepted ones.  Line-break
     *         bytes (CR, LF) are not counted in either group.
     *
     * @exception IOException if reading from the stream fails.
     */
    public static String getTextTransferEncoding(InputStream content)
            throws IOException {
        // Longest run of bytes without a line break that may be sent unencoded
        // (RFC 2045 limits 7bit data to 998 octets per line, excluding CRLF).
        final int maxLineLength = 998;

        // for efficiency, we'll read in blocks.
        BufferedInputStream in = new BufferedInputStream(content, 4096);

        // Counters are long: a stream can hold more than Integer.MAX_VALUE bytes,
        // and a wrapped int counter would corrupt the ratio test at the end.
        long span = 0; // bytes seen since the last line break.
        boolean containsLongLines = false;
        long asciiChars = 0;
        long nonAsciiChars = 0;

        int ch;
        // read() returns -1 at EOF; then we go decide what type we've found.
        while ((ch = in.read()) != -1) {
            // Either CR or LF ends the current line.  We don't validate that
            // they come as a proper CRLF pair.
            if (ch == '\n' || ch == '\r') {
                span = 0;
                continue;
            }

            span++;
            // the text has long lines, we can't transfer this as unencoded text.
            if (span > maxLineLength) {
                containsLongLines = true;
            }

            if (isAscii(ch)) {
                asciiChars++;
            } else {
                // this byte cannot go out as plain 7bit text.
                nonAsciiChars++;
            }
        }

        if (nonAsciiChars == 0) {
            // Only valid chars.  If there are long lines we still need Q-P, which
            // is only slightly longer but handles folding the longer lines;
            // otherwise 7bit is the ideal, easiest case.
            return containsLongLines ? "quoted-printable" : "7bit";
        }

        // Mostly characters requiring encoding?  Base64 is our best bet.
        // Otherwise Q-P will use fewer bytes than the full Base64.
        return nonAsciiChars > asciiChars ? "base64" : "quoted-printable";
    }


    /**
     * Examines a string of text and decides which content transfer encoding
     * should be used for it.  The preferred answer is "7bit"; "quoted-printable"
     * or "base64" is chosen when the text contains over-long lines or characters
     * rejected by {@code isAscii(int)}.
     *
     * <p>Like the {@code InputStream} overload, text whose characters are all
     * accepted but which has a line longer than 998 characters is reported as
     * "quoted-printable" rather than "7bit", because such a line cannot be
     * transferred unencoded.
     *
     * @param content     A string for the content we're examining.
     *
     * @return "7bit" when no character is rejected by {@code isAscii(int)} and no
     *         line is longer than 998 characters; "quoted-printable" when such
     *         text has a longer line, or when rejected characters do not outnumber
     *         accepted ones; "base64" when rejected characters outnumber accepted ones.
     *
     * @throws NullPointerException if {@code content} is null.
     */
    public static String getTextTransferEncoding(String content) {
        // Longest run of characters without a line break that may be sent unencoded
        // (RFC 2045 limits 7bit data to 998 octets per line, excluding CRLF).
        final int maxLineLength = 998;

        int span = 0; // characters seen since the last line break.
        boolean containsLongLines = false;
        int asciiChars = 0;
        int nonAsciiChars = 0;

        for (int i = 0; i < content.length(); i++) {
            int ch = content.charAt(i);

            // Track line length: either CR or LF ends the current line.
            if (ch == '\n' || ch == '\r') {
                span = 0;
            } else if (++span > maxLineLength) {
                // the text has long lines, we can't transfer this as unencoded text.
                containsLongLines = true;
            }

            // Every character, line breaks included, is classified by isAscii,
            // exactly as before the long-line check was added.
            if (isAscii(ch)) {
                asciiChars++;
            } else {
                nonAsciiChars++;
            }
        }

        if (nonAsciiChars == 0) {
            // Only valid chars.  Long lines still need Q-P so they can be folded;
            // otherwise 7bit is the ideal, easiest case.
            return containsLongLines ? "quoted-printable" : "7bit";
        }

        // Mostly characters requiring encoding?  Base64 is our best bet.
        // Otherwise Q-P will use fewer bytes than the full Base64.
        return nonAsciiChars > asciiChars ? "base64" : "quoted-printable";
    }


    /**

     * Test to see if this string contains only US-ASCII (i.e., 7-bit

     * ASCII) Characters.

     *

     * @param s      The test string.

     *

     * @return true if this is a valid 7-bit ASCII encoding, false if it

     *         contains any non-US ASCII characters.

     */

    static public boolean isAscii(String s) {

        for (int i = 0; i < s.length(); i++) {

            
展开阅读全文