集册 Java实例教程 用UTF读取字符串

用UTF读取字符串

欢马劈雪     最近更新时间:2020-01-02 10:19:05

463
从数据流中完全读取UTF-8编码的字符串,然后返回该字符串。
/*
 from nowjava.com 
*/

/*

 Written in 2013 by Peter O.

 Any copyright is dedicated to the Public Domain.

 http://creativecommons.org/publicdomain/zero/1.0/

 If you like this, you should donate to Peter O.

 at: http://upokecenter.dreamhosters.com/articles/donate-now-2/

 */

import java.io.*;


public class Main{

    /**

     * Reads a string in UTF-8 encoding from a data stream in full and returns that

     * string. Replaces invalid encoding with the replacement character

     * (U+FFFD).

     * @param stream A readable data stream.

     * @return The string read.

     * @throws java.io.IOException An I/O error occurred.

     * @throws NullPointerException The parameter {@code stream} is null.

     */

    public static String ReadUtf8ToString(InputStream stream)
            throws java.io.IOException {
        // A negative byte count asks the three-argument overload to read
        // until the end of the stream.
        final int readUntilEnd = -1;
        // Invalid encoding is replaced with U+FFFD rather than reported.
        final boolean replaceInvalid = true;
        return ReadUtf8ToString(stream, readUntilEnd, replaceInvalid);
    }

    /**

     * Reads a string in UTF-8 encoding from a data stream and returns that string.

     * @param stream A readable data stream.

     * @param bytesCount The length, in bytes, of the string. If this is less than

     * 0, this function will read until the end of the stream.

     * @param replace If true, replaces invalid encoding with the replacement

     * character (U+FFFD). If false, throws an {@code IOException} when

     * invalid UTF-8 is encountered.

     * @return The string read.

     * @throws java.io.IOException An I/O error occurred; or, the string is not

     * valid UTF-8 and {@code replace} is false.

     * @throws NullPointerException The parameter {@code stream} is null.

     */

    public static String ReadUtf8ToString(InputStream stream,
            int bytesCount, boolean replace) throws java.io.IOException {
        StringBuilder builder = new StringBuilder();
        // Decode with the ReadUtf8 method declared in this class, so the
        // file does not depend on a separately supplied DataUtilities class.
        int status = ReadUtf8(stream, bytesCount, builder, replace);
        if (status == -1) {
            // -1 is reported when the input is not valid UTF-8 and replace
            // is false. The message text is kept as-is for callers that
            // may inspect it.
            throw new IOException("Unpaired surrogate code point found.",
                    new java.nio.charset.MalformedInputException(1));
        }
        // Any other status, including -2 (the stream ended before
        // bytesCount bytes were read), is not treated as an error here:
        // whatever was decoded so far is returned.
        return builder.toString();
    }

    /**

     * Reads a string in UTF-8 encoding from a data stream.

     * @param stream A readable data stream.

     * @param bytesCount The length, in bytes, of the string. If this is less than

     * 0, this function will read until the end of the stream.

     * @param builder A string builder object where the resulting string will be

     * stored.

     * @param replace If true, replaces invalid encoding with the replacement

     * character (U+FFFD). If false, stops processing when invalid UTF-8

     * is seen.

     * @return 0 if the entire string was read without errors, -1 if the string is

     * not valid UTF-8 and {@code replace} is false, or -2 if the end of the

     * stream was reached before the last character was read completely

     * (which is only the case if {@code bytesCount} is 0 or greater).

     * @throws java.io.IOException An I/O error occurred.

     * @throws NullPointerException The parameter {@code stream} is null or {@code

     * builder} is null.

     */

    public static int ReadUtf8(InputStream stream, int bytesCount,

            StringBuilder builder, boolean replace)

            throws java.io.IOException {

        if (stream == null) {

            throw new NullPointerException("stream");

        }

        if (builder == null) {

            throw new NullPointerException("builder");

        }

        int cp = 0;

        int bytesSeen = 0;

        int bytesNeeded = 0;

        int lower = 0x80;

        int upper = 0xbf;

        int pointer = 0;

        while (pointer < bytesCount || bytesCount < 0) {

            int b = stream.read();

            if (b < 0) {

                if (bytesNeeded != 0) {

                    bytesNeeded = 0;

                    if (replace) {

                        builder.append((char) 0xfffd);

                        if (bytesCount >= 0) {

                            return -2;

                        }

                        break; // end of stream

                    }

                    return -1;

                }

                if (bytesCount >= 0) {

                    return -2;

                }

                break; // end of stream

            }

            if (bytesCount > 0) {

                ++pointer;

            }

            if (bytesNeeded == 0) {

                if ((b & 0x7f) == b) {

                    builder.append((char) b);

                } else if (b >= 0xc2 && b <= 0xdf) {

                    bytesNeeded = 1;

                    cp = (b - 0xc0) << 6;

                } else if (b >= 0xe0 && b <= 0xef) {

                    lower = (b == 0xe0) ? 0xa0 : 0x80;

                    upper = (b == 0xed) ? 0x9f : 0xbf;

                    bytesNeeded = 2;

                    cp = (b - 0xe0) << 12;

                } else if (b >= 0xf0 && b <= 0xf4) {

                    lower = (b == 0xf0) ? 0x90 : 0x80;

                    upper = (b == 0xf4) ? 0x8f : 0xbf;

                    bytesNeeded = 3;

                    cp = (b - 0xf0) << 18;

                } else {

                    if (replace) {

                        builder.append((char) 0xfffd);

                    } else {

                        return -1;

                    }

                }

                continue;

            }

            if (b < lower || b > upper) {

                cp = bytesNeeded = bytesSeen = 0;

                lower = 0x80;

                upper = 0xbf;

                if (replace) {

                    builder.append((char) 0xfffd);

                    // "Read" the last byte again

                    if (b < 0x80) {

                        builder.append((char) b);

                    } else if (b >= 0xc2 && b <= 0xdf) {

                        bytesNeeded = 1;

                        cp = (b - 0xc0) << 6;

                    } else if (b >= 0xe0 && b <= 0xef) {

                        lower = (b == 0xe0) ? 0xa0 : 0x80;

                        upper = (b == 0xed) ? 0x9f : 0xbf;

                        bytesNeeded = 2;

                        cp = (b - 0xe0) << 12;

                    } else if (b >= 0xf0 && b <= 0xf4) {

                        lower = (b == 0xf0) ? 0x90 : 0x80;

                        upper = (b == 0xf4) ? 0x8f : 0xbf;

                        bytesNeeded = 3;

                        cp = (b - 0xf0) << 18;

                    } else {

                        builder.append((char) 0xfffd);

                
展开阅读全文