JDK8/Java8源码在线阅读

JDK8/Java8源码在线阅读 / sun / text / normalizer / NormalizerBase.java
/*
 * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
/*
 *******************************************************************************
 * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved         *
 *                                                                             *
 * The original version of this source code and documentation is copyrighted   *
 * and owned by IBM, These materials are provided under terms of a License     *
 * Agreement between IBM and Sun. This technology is protected by multiple     *
 * US and International patents. This notice and attribution to IBM may not    *
 * to removed.                                                                 *
 *******************************************************************************
 */

package sun.text.normalizer;

import java.text.CharacterIterator;
import java.text.Normalizer;

/**
 * Unicode Normalization
 *
 * <h2>Unicode normalization API</h2>
 *
 * <code>normalize</code> transforms Unicode text into an equivalent composed or
 * decomposed form, allowing for easier sorting and searching of text.
 * <code>normalize</code> supports the standard normalization forms described in
 * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
 * Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>.
 *
 * Characters with accents or other adornments can be encoded in
 * several different ways in Unicode.  For example, take the character A-acute.
 * In Unicode, this can be encoded as a single character (the
 * "composed" form):
 *
 * <p>
 *      00C1    LATIN CAPITAL LETTER A WITH ACUTE
 * </p>
 *
 * or as two separate characters (the "decomposed" form):
 *
 * <p>
 *      0041    LATIN CAPITAL LETTER A
 *      0301    COMBINING ACUTE ACCENT
 * </p>
 *
 * To a user of your program, however, both of these sequences should be
 * treated as the same "user-level" character "A with acute accent".  When you
 * are searching or comparing text, you must ensure that these two sequences are
 * treated equivalently.  In addition, you must handle characters with more than
 * one accent.  Sometimes the order of a character's combining accents is
 * significant, while in other cases accent sequences in different orders are
 * really equivalent.
 *
 * Similarly, the string "ffi" can be encoded as three separate letters:
 *
 * <p>
 *      0066    LATIN SMALL LETTER F
 *      0066    LATIN SMALL LETTER F
 *      0069    LATIN SMALL LETTER I
 * </p>
 *
 * or as the single character
 *
 * <p>
 *      FB03    LATIN SMALL LIGATURE FFI
 * </p>
 *
 * The ffi ligature is not a distinct semantic character, and strictly speaking
 * it shouldn't be in Unicode at all, but it was included for compatibility
 * with existing character sets that already provided it.  The Unicode standard
 * identifies such characters by giving them "compatibility" decompositions
 * into the corresponding semantic characters.  When sorting and searching, you
 * will often want to use these mappings.
 *
 * <code>normalize</code> helps solve these problems by transforming text into
 * the canonical composed and decomposed forms as shown in the first example
 * above. In addition, you can have it perform compatibility decompositions so
 * that you can treat compatibility characters the same as their equivalents.
 * Finally, <code>normalize</code> rearranges accents into the proper canonical
 * order, so that you do not have to worry about accent rearrangement on your
 * own.
 *
 * Form FCD, "Fast C or D", is also designed for collation.
 * It allows to work on strings that are not necessarily normalized
 * with an algorithm (like in collation) that works under "canonical closure",
 * i.e., it treats precomposed characters and their decomposed equivalents the
 * same.
 *
 * It is not a normalization form because it does not provide for uniqueness of
 * representation. Multiple strings may be canonically equivalent (their NFDs
 * are identical) and may all conform to FCD without being identical themselves.
 *
 * The form is defined such that the "raw decomposition", the recursive
 * canonical decomposition of each character, results in a string that is
 * canonically ordered. This means that precomposed characters are allowed for
 * as long as their decompositions do not need canonical reordering.
 *
 * Its advantage for a process like collation is that all NFD and most NFC texts
 * - and many unnormalized texts - already conform to FCD and do not need to be
 * normalized (NFD) for such a process. The FCD quick check will return YES for
 * most strings in practice.
 *
 * normalize(FCD) may be implemented with NFD.
 *
 * For more details on FCD see the collation design document:
 * http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm
 *
 * ICU collation performs either NFD or FCD normalization automatically if
 * normalization is turned on for the collator object. Beyond collation and
 * string search, normalized strings may be useful for string equivalence
 * comparisons, transliteration/transcription, unique representations, etc.
 *
 * The W3C generally recommends to exchange texts in NFC.
 * Note also that most legacy character encodings use only precomposed forms and
 * often do not encode any combining marks by themselves. For conversion to such
 * character encodings the Unicode text needs to be normalized to NFC.
 * For more usage examples, see the Unicode Standard Annex.
 * @stable ICU 2.8
 */

public final class NormalizerBase implements Cloneable {

    //-------------------------------------------------------------------------
    // Private data
    //-------------------------------------------------------------------------
    private char[] buffer = new char[100];
    private int bufferStart = 0;
    private int bufferPos   = 0;
    private int bufferLimit = 0;

    // The input text and our position in it
    private UCharacterIterator  text;
    private Mode                mode = NFC;
    private int                 options = 0;
    private int                 currentIndex;
    private int                 nextIndex;

    /**
     * Options bit set value to select Unicode 3.2 normalization
     * (except NormalizationCorrections).
     * At most one Unicode version can be selected at a time.
     * @stable ICU 2.6
     */
    public static final int UNICODE_3_2=0x20;

    /**
     * Constant indicating that the end of the iteration has been reached.
     * This is guaranteed to have the same value as {@link UCharacterIterator#DONE}.
     * @stable ICU 2.8
     */
    public static final int DONE = UCharacterIterator.DONE;

    /**
     * Constants for normalization modes.
     * @stable ICU 2.8
     */
    public static class Mode {
        private int modeValue;
        private Mode(int value) {
            modeValue = value;
        }

        /**
         * This method is used for method dispatch
         * @stable ICU 2.6
         */
        protected int normalize(char[] src, int srcStart, int srcLimit,
                                char[] dest,int destStart,int destLimit,
                                UnicodeSet nx) {
            int srcLen = (srcLimit - srcStart);
            int destLen = (destLimit - destStart);
            if( srcLen > destLen ) {
                return srcLen;
            }
            System.arraycopy(src,srcStart,dest,destStart,srcLen);
            return srcLen;
        }

        /**
         * This method is used for method dispatch
         * @stable ICU 2.6
         */
        protected int normalize(char[] src, int srcStart, int srcLimit,
                                char[] dest,int destStart,int destLimit,
                                int options) {
            return normalize(   src, srcStart, srcLimit,
                                dest,destStart,destLimit,
                                NormalizerImpl.getNX(options)
                                );
        }

        /**
         * This method is used for method dispatch
         * @stable ICU 2.6
         */
        protected String normalize(String src, int options) {
            return src;
        }

        /**
         * This method is used for method dispatch
         * @stable ICU 2.8
         */
        protected int getMinC() {
            return -1;
        }

        /**
         * This method is used for method dispatch
         * @stable ICU 2.8
         */
        protected int getMask() {
            return -1;
        }

        /**
         * This method is used for method dispatch
         * @stable ICU 2.8
         */
        protected IsPrevBoundary getPrevBoundary() {
            return null;
        }

        /**
         * This method is used for method dispatch
         * @stable ICU 2.8
         */
        protected IsNextBoundary getNextBoundary() {
            return null;
        }

        /**
         * This method is used for method dispatch
         * @stable ICU 2.6
         */
        protected QuickCheckResult quickCheck(char[] src,int start, int limit,
                                              boolean allowMaybe,UnicodeSet nx) {
            if(allowMaybe) {
                return MAYBE;
            }
            return NO;
        }

        /**
         * This method is used for method dispatch
         * @stable ICU 2.8
         */
        protected boolean isNFSkippable(int c) {
            return true;
        }
    }

    /**
     * No decomposition/composition.
     * @stable ICU 2.8
     */
    public static final Mode NONE = new Mode(1);

    /**
     * Canonical decomposition.
     * @stable ICU 2.8
     */
    public static final Mode NFD = new NFDMode(2);

    private static final class NFDMode extends Mode {
        private NFDMode(int value) {
            super(value);
        }

        protected int normalize(char[] src, int srcStart, int srcLimit,
                                char[] dest,int destStart,int destLimit,
                                UnicodeSet nx) {
            int[] trailCC = new int[1];
            return NormalizerImpl.decompose(src,  srcStart,srcLimit,
                                            dest, destStart,destLimit,
                                            false, trailCC,nx);
        }

        protected String normalize( String src, int options) {
            return decompose(src,false,options);
        }

        protected int getMinC() {
            return NormalizerImpl.MIN_WITH_LEAD_CC;
        }

        protected IsPrevBoundary getPrevBoundary() {
            return new IsPrevNFDSafe();
        }

        protected IsNextBoundary getNextBoundary() {
            return new IsNextNFDSafe();
        }

        protected int getMask() {
            return (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFD);
        }

        protected QuickCheckResult quickCheck(char[] src,int start,
                                              int limit,boolean allowMaybe,
                                              UnicodeSet nx) {
            return NormalizerImpl.quickCheck(
                                             src, start,limit,
                                             NormalizerImpl.getFromIndexesArr(
                                                                              NormalizerImpl.INDEX_MIN_NFD_NO_MAYBE
                                                                              ),
                                             NormalizerImpl.QC_NFD,
                                             0,
                                             allowMaybe,
                                             nx
                                             );
        }

        protected boolean isNFSkippable(int c) {
            return NormalizerImpl.isNFSkippable(c,this,
                                                (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFD)
                                                );
        }
    }

    /**
     * Compatibility decomposition.
     * @stable ICU 2.8
     */
    public static final Mode NFKD = new NFKDMode(3);

    private static final class NFKDMode extends Mode {
        private NFKDMode(int value) {
            super(value);
        }

        protected int normalize(char[] src, int srcStart, int srcLimit,
                                char[] dest,int destStart,int destLimit,
                                UnicodeSet nx) {
            int[] trailCC = new int[1];
            return NormalizerImpl.decompose(src,  srcStart,srcLimit,
                                            dest, destStart,destLimit,
                                            true, trailCC, nx);
        }

        protected String normalize( String src, int options) {
            return decompose(src,true,options);
        }

        protected int getMinC() {
            return NormalizerImpl.MIN_WITH_LEAD_CC;
        }

        protected IsPrevBoundary getPrevBoundary() {
            return new IsPrevNFDSafe();
        }

        protected IsNextBoundary getNextBoundary() {
            return new IsNextNFDSafe();
        }

        protected int getMask() {
            return (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFKD);
        }

        protected QuickCheckResult quickCheck(char[] src,int start,
                                              int limit,boolean allowMaybe,
                                              UnicodeSet nx) {
            return NormalizerImpl.quickCheck(
                                             src,start,limit,
                                             NormalizerImpl.getFromIndexesArr(
                                                                              NormalizerImpl.INDEX_MIN_NFKD_NO_MAYBE
                                                                              ),
                                             NormalizerImpl.QC_NFKD,
                                             NormalizerImpl.OPTIONS_COMPAT,
                                             allowMaybe,
                                             nx
                                             );
        }

        protected boolean isNFSkippable(int c) {
            return NormalizerImpl.isNFSkippable(c, this,
                                                (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFKD)
                                                );
        }
    }

    /**
     * Canonical decomposition followed by canonical composition.
     * @stable ICU 2.8
     */
    public static final Mode NFC = new NFCMode(4);

    private static final class NFCMode extends Mode{
        private NFCMode(int value) {
            super(value);
        }
        protected int normalize(char[] src, int srcStart, int srcLimit,
                                char[] dest,int destStart,int destLimit,
                                UnicodeSet nx) {
            return NormalizerImpl.compose( src, srcStart, srcLimit,
                                           dest,destStart,destLimit,
                                           0, nx);
        }

        protected String normalize( String src, int options) {
            return compose(src, false, options);
        }

        protected int getMinC() {
            return NormalizerImpl.getFromIndexesArr(
                                                    NormalizerImpl.INDEX_MIN_NFC_NO_MAYBE
                                                    );
        }
        protected IsPrevBoundary getPrevBoundary() {
            return new IsPrevTrueStarter();
        }
        protected IsNextBoundary getNextBoundary() {
            return new IsNextTrueStarter();
        }
        protected int getMask() {
            return (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFC);
        }
        protected QuickCheckResult quickCheck(char[] src,int start,
                                              int limit,boolean allowMaybe,
                                              UnicodeSet nx) {
            return NormalizerImpl.quickCheck(
                                             src,start,limit,
                                             NormalizerImpl.getFromIndexesArr(
                                                                              NormalizerImpl.INDEX_MIN_NFC_NO_MAYBE
                                                                              ),
                                             NormalizerImpl.QC_NFC,
                                             0,
                                             allowMaybe,
                                             nx
                                             );
        }
        protected boolean isNFSkippable(int c) {
            return NormalizerImpl.isNFSkippable(c,this,
                                                ( NormalizerImpl.CC_MASK|NormalizerImpl.COMBINES_ANY|
                                                  (NormalizerImpl.QC_NFC & NormalizerImpl.QC_ANY_NO)
                                                  )
                                                );
        }
    };

    /**
     * Compatibility decomposition followed by canonical composition.
     * @stable ICU 2.8
     */
    public static final Mode NFKC =new NFKCMode(5);

    private static final class NFKCMode extends Mode{
        private NFKCMode(int value) {
            super(value);
        }
        protected int normalize(char[] src, int srcStart, int srcLimit,
                                char[] dest,int destStart,int destLimit,
                                UnicodeSet nx) {
            return NormalizerImpl.compose(src,  srcStart,srcLimit,
                                          dest, destStart,destLimit,
                                          NormalizerImpl.OPTIONS_COMPAT, nx);
        }

        protected String normalize( String src, int options) {
            return compose(src, true, options);
        }
        protected int getMinC() {
            return NormalizerImpl.getFromIndexesArr(
                                                    NormalizerImpl.INDEX_MIN_NFKC_NO_MAYBE
                                                    );
        }
        protected IsPrevBoundary getPrevBoundary() {
            return new IsPrevTrueStarter();
        }
        protected IsNextBoundary getNextBoundary() {
            return new IsNextTrueStarter();
        }
        protected int getMask() {
            return (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFKC);
        }
        protected QuickCheckResult quickCheck(char[] src,int start,
                                              int limit,boolean allowMaybe,
                                              UnicodeSet nx) {
            return NormalizerImpl.quickCheck(
                                             src,start,limit,
                                             NormalizerImpl.getFromIndexesArr(
                                                                              NormalizerImpl.INDEX_MIN_NFKC_NO_MAYBE
                                                                              ),
                                             NormalizerImpl.QC_NFKC,
                                             NormalizerImpl.OPTIONS_COMPAT,
                                             allowMaybe,
                                             nx
                                             );
        }
        protected boolean isNFSkippable(int c) {
            return NormalizerImpl.isNFSkippable(c, this,
                                                ( NormalizerImpl.CC_MASK|NormalizerImpl.COMBINES_ANY|
                                                  (NormalizerImpl.QC_NFKC & NormalizerImpl.QC_ANY_NO)
                                                  )
                                                );
        }
    };

    /**
     * Result values for quickCheck().
     * For details see Unicode Technical Report 15.
     * @stable ICU 2.8
     */
    public static final class QuickCheckResult{
        private int resultValue;
        private QuickCheckResult(int value) {
            resultValue=value;
        }
    }
    /**
     * Indicates that string is not in the normalized format
     * @stable ICU 2.8
     */
    public static final QuickCheckResult NO = new QuickCheckResult(0);

    /**
     * Indicates that string is in the normalized format
     * @stable ICU 2.8
     */
    public static final QuickCheckResult YES = new QuickCheckResult(1);

    /**
     * Indicates it cannot be determined if string is in the normalized
     * format without further thorough checks.
     * @stable ICU 2.8
     */
    public static final QuickCheckResult MAYBE = new QuickCheckResult(2);

    //-------------------------------------------------------------------------
    // Constructors
    //-------------------------------------------------------------------------

    /**
     * Creates a new <tt>Normalizer</tt> object for iterating over the
     * normalized form of a given string.
     * <p>
     * The <tt>options</tt> parameter specifies which optional
     * <tt>Normalizer</tt> features are to be enabled for this object.
     * <p>
     * @param str  The string to be normalized.  The normalization
     *              will start at the beginning of the string.
     *
     * @param mode The normalization mode.
     *
     * @param opt Any optional features to be enabled.
     *            Currently the only available option is {@link #UNICODE_3_2}.
     *            If you want the default behavior corresponding to one of the
     *            standard Unicode Normalization Forms, use 0 for this argument.
     * @stable ICU 2.6
     */
    public NormalizerBase(String str, Mode mode, int opt) {
        this.text = UCharacterIterator.getInstance(str);
        this.mode = mode;
        this.options=opt;
    }

    /**
     * Creates a new <tt>Normalizer</tt> object for iterating over the
     * normalized form of the given text.
     * <p>
     * @param iter  The input text to be normalized.  The normalization
     *              will start at the beginning of the string.
     *
     * @param mode  The normalization mode.
     */
    public NormalizerBase(CharacterIterator iter, Mode mode) {
          this(iter, mode, UNICODE_LATEST);
    }

    /**
     * Creates a new <tt>Normalizer</tt> object for iterating over the
     * normalized form of the given text.
     * <p>
     * @param iter  The input text to be normalized.  The normalization
     *              will start at the beginning of the string.
     *
     * @param mode  The normalization mode.
     *
     * @param opt Any optional features to be enabled.
     *            Currently the only available option is {@link #UNICODE_3_2}.
     *            If you want the default behavior corresponding to one of the
     *            standard Unicode Normalization Forms, use 0 for this argument.
     * @stable ICU 2.6
     */
    public NormalizerBase(CharacterIterator iter, Mode mode, int opt) {
        this.text = UCharacterIterator.getInstance(
                                                   (CharacterIterator)iter.clone()
                                                   );
        this.mode = mode;
        this.options = opt;
    }

    /**
     * Clones this <tt>Normalizer</tt> object.  All properties of this
     * object are duplicated in the new object, including the cloning of any
     * {@link CharacterIterator} that was passed in to the constructor
     * or to {@link #setText(CharacterIterator) setText}.
     * However, the text storage underlying
     * the <tt>CharacterIterator</tt> is not duplicated unless the
     * iterator's <tt>clone</tt> method does so.
     * @stable ICU 2.8
     */
    public Object clone() {
        try {
            NormalizerBase copy = (NormalizerBase) super.clone();
            copy.text = (UCharacterIterator) text.clone();
            //clone the internal buffer
            if (buffer != null) {
                copy.buffer = new char[buffer.length];
                System.arraycopy(buffer,0,copy.buffer,0,buffer.length);
            }
            return copy;
        }
        catch (CloneNotSupportedException e) {
            throw new InternalError(e.toString(), e);
        }
    }

    //--------------------------------------------------------------------------
    // Static Utility methods
    //--------------------------------------------------------------------------

    /**
     * Compose a string.
     * The string will be composed to according the the specified mode.
     * @param str        The string to compose.
     * @param compat     If true the string will be composed accoding to
     *                    NFKC rules and if false will be composed according to
     *                    NFC rules.
     * @param options    The only recognized option is UNICODE_3_2
     * @return String    The composed string
     * @stable ICU 2.6
     */
    public static String compose(String str, boolean compat, int options) {

        char[] dest, src;
        if (options == UNICODE_3_2_0_ORIGINAL) {
            String mappedStr = NormalizerImpl.convert(str);
            dest = new char[mappedStr.length()*MAX_BUF_SIZE_COMPOSE];
            src = mappedStr.toCharArray();
        } else {
            dest = new char[str.length()*MAX_BUF_SIZE_COMPOSE];
            src = str.toCharArray();
        }
        int destSize=0;

        UnicodeSet nx = NormalizerImpl.getNX(options);

        /* reset options bits that should only be set here or inside compose() */
        options&=~(NormalizerImpl.OPTIONS_SETS_MASK|NormalizerImpl.OPTIONS_COMPAT|NormalizerImpl.OPTIONS_COMPOSE_CONTIGUOUS);

        if(compat) {
            options|=NormalizerImpl.OPTIONS_COMPAT;
        }

        for(;;) {
            destSize=NormalizerImpl.compose(src,0,src.length,
                                            dest,0,dest.length,options,
                                            nx);
            if(destSize<=dest.length) {
                return new String(dest,0,destSize);
            } else {
                dest = new char[destSize];
            }
        }
    }

    private static final int MAX_BUF_SIZE_COMPOSE = 2;
    private static final int MAX_BUF_SIZE_DECOMPOSE = 3;

    /**
     * Decompose a string.
     * The string will be decomposed to according the the specified mode.
     * @param str       The string to decompose.
     * @param compat    If true the string will be decomposed accoding to NFKD
     *                   rules and if false will be decomposed according to NFD
     *                   rules.
     * @return String   The decomposed string
     * @stable ICU 2.8
     */
    public static String decompose(String str, boolean compat) {
        return decompose(str,compat,UNICODE_LATEST);
    }

    /**
     * Decompose a string.
     * The string will be decomposed to according the the specified mode.
     * @param str     The string to decompose.
     * @param compat  If true the string will be decomposed accoding to NFKD
     *                 rules and if false will be decomposed according to NFD
     *                 rules.
     * @param options The normalization options, ORed together (0 for no options).
     * @return String The decomposed string
     * @stable ICU 2.6
     */
    public static String decompose(String str, boolean compat, int options) {

        int[] trailCC = new int[1];
        int destSize=0;
        UnicodeSet nx = NormalizerImpl.getNX(options);
        char[] dest;

        if (options == UNICODE_3_2_0_ORIGINAL) {
            String mappedStr = NormalizerImpl.convert(str);
            dest = new char[mappedStr.length()*MAX_BUF_SIZE_DECOMPOSE];

            for(;;) {
                destSize=NormalizerImpl.decompose(mappedStr.toCharArray(),0,mappedStr.length(),
                                                  dest,0,dest.length,
                                                  compat,trailCC, nx);
                if(destSize<=dest.length) {
                    return new String(dest,0,destSize);
                } else {
                    dest = new char[destSize];
                }
            }
        } else {
            dest = new char[str.length()*MAX_BUF_SIZE_DECOMPOSE];

            for(;;) {
                destSize=NormalizerImpl.decompose(str.toCharArray(),0,str.length(),
                                                  dest,0,dest.length,
                                                  compat,trailCC, nx);
                if(destSize<=dest.length) {
                    return new String(dest,0,destSize);
                } else {
                    dest = new char[destSize];
                }
            }
        }
    }

    /**
     * Normalize a string.
     * The string will be normalized according the the specified normalization
     * mode and options.
     * @param src       The char array to compose.
     * @param srcStart  Start index of the source
     * @param srcLimit  Limit index of the source
     * @param dest      The char buffer to fill in
     * @param destStart Start index of the destination buffer
     * @param destLimit End index of the destination buffer
     * @param mode      The normalization mode; one of Normalizer.NONE,
     *                   Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
     *                   Normalizer.NFKD, Normalizer.DEFAULT
     * @param options The normalization options, ORed together (0 for no options).
     * @return int      The total buffer size needed;if greater than length of
     *                   result, the output was truncated.
     * @exception       IndexOutOfBoundsException if the target capacity is
     *                   less than the required length
     * @stable ICU 2.6
     */
    public static int normalize(char[] src,int srcStart, int srcLimit,
                                char[] dest,int destStart, int destLimit,
                                Mode  mode, int options) {
        int length = mode.normalize(src,srcStart,srcLimit,dest,destStart,destLimit, options);

        if(length<=(destLimit-destStart)) {
            return length;
        } else {
            throw new IndexOutOfBoundsException(Integer.toString(length));
        }
    }

    //-------------------------------------------------------------------------
    // Iteration API
    //-------------------------------------------------------------------------

    /**
     * Return the current character in the normalized text->
     * @return The codepoint as an int
     * @stable ICU 2.8
     */
    public int current() {
        if(bufferPos<bufferLimit || nextNormalize()) {
            return getCodePointAt(bufferPos);
        } else {
            return DONE;
        }
    }

    /**
     * Return the next character in the normalized text and advance
     * the iteration position by one.  If the end
     * of the text has already been reached, {@link #DONE} is returned.
     * @return The codepoint as an int
     * @stable ICU 2.8
     */
    public int next() {
        if(bufferPos<bufferLimit ||  nextNormalize()) {
            int c=getCodePointAt(bufferPos);
            bufferPos+=(c>0xFFFF) ? 2 : 1;
            return c;
        } else {
            return DONE;
        }
    }


    /**
     * Return the previous character in the normalized text and decrement
     * the iteration position by one.  If the beginning
     * of the text has already been reached, {@link #DONE} is returned.
     * @return The codepoint as an int
     * @stable ICU 2.8
     */
    public int previous() {
        if(bufferPos>0 || previousNormalize()) {
            int c=getCodePointAt(bufferPos-1);
            bufferPos-=(c>0xFFFF) ? 2 : 1;
            return c;
        } else {
            return DONE;
        }
    }

    /**
     * Reset the index to the beginning of the text.
     * This is equivalent to setIndexOnly(startIndex)).
     * @stable ICU 2.8
     */
    public void reset() {
        text.setIndex(0);
        currentIndex=nextIndex=0;
        clearBuffer();
    }

    /**
     * Set the iteration position in the input text that is being normalized,
     * without any immediate normalization.
     * After setIndexOnly(), getIndex() will return the same index that is
     * specified here.
     *
     * @param index the desired index in the input text.
     * @stable ICU 2.8
     */
    public void setIndexOnly(int index) {
        text.setIndex(index);
        currentIndex=nextIndex=index; // validates index
        clearBuffer();
    }

    /**
     * Set the iteration position in the input text that is being normalized
     * and return the first normalized character at that position.
     * <p>
     * <b>Note:</b> This method sets the position in the <em>input</em> text,
     * while {@link #next} and {@link #previous} iterate through characters
     * in the normalized <em>output</em>.  This means that there is not
     * necessarily a one-to-one correspondence between characters returned
     * by <tt>next</tt> and <tt>previous</tt> and the indices passed to and
     * returned from <tt>setIndex</tt> and {@link #getIndex}.
     * <p>
     * @param index the desired index in the input text->
     *
     * @return   the first normalized character that is the result of iterating
     *            forward starting at the given index.
     *
     * @throws IllegalArgumentException if the given index is less than
     *          {@link #getBeginIndex} or greater than {@link #getEndIndex}.
     * @return The codepoint as an int
     * @deprecated ICU 3.2
     * @obsolete ICU 3.2
     */
     @Deprecated
     public int setIndex(int index) {
         setIndexOnly(index);
         return current();
     }

    /**
     * Retrieve the index of the start of the input text. This is the begin
     * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the
     * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating
     * @deprecated ICU 2.2. Use startIndex() instead.
     * @return The codepoint as an int
     * @see #startIndex
     */
    @Deprecated
    public int getBeginIndex() {
        return 0;
    }

    /**
     * Retrieve the index of the end of the input text.  This is the end index
     * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
     * over which this <tt>Normalizer</tt> is iterating
     * @deprecated ICU 2.2. Use endIndex() instead.
     * @return The codepoint as an int
     * @see #endIndex
     */
    @Deprecated
    public int getEndIndex() {
        return endIndex();
    }

    /**
     * Retrieve the current iteration position in the input text that is
     * being normalized.  This method is useful in applications such as
     * searching, where you need to be able to determine the position in
     * the input text that corresponds to a given normalized output character.
     * <p>
     * <b>Note:</b> This method sets the position in the <em>input</em>, while
     * {@link #next} and {@link #previous} iterate through characters in the
     * <em>output</em>.  This means that there is not necessarily a one-to-one
     * correspondence between characters returned by <tt>next</tt> and
     * <tt>previous</tt> and the indices passed to and returned from
     * <tt>setIndex</tt> and {@link #getIndex}.
     * @return The current iteration position
     * @stable ICU 2.8
     */
    public int getIndex() {
        if(bufferPos<bufferLimit) {
            return currentIndex;
        } else {
            return nextIndex;
        }
    }

    /**
     * Retrieve the index of the end of the input text->  This is the end index
     * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
     * over which this <tt>Normalizer</tt> is iterating
     * @return The current iteration position
     * @stable ICU 2.8
     */
    public int endIndex() {
        return text.getLength();
    }

    //-------------------------------------------------------------------------
    // Property access methods
    //-------------------------------------------------------------------------
    /**
     * Set the normalization mode for this object.
     * <p>
     * <b>Note:</b>If the normalization mode is changed while iterating
     * over a string, calls to {@link #next} and {@link #previous} may
     * return previously buffers characters in the old normalization mode
     * until the iteration is able to re-sync at the next base character.
     * It is safest to call {@link #setText setText()}, {@link #first},
     * {@link #last}, etc. after calling <tt>setMode</tt>.
     * <p>
     * @param newMode the new mode for this <tt>Normalizer</tt>.
     * The supported modes are:
     * <ul>
     *  <li>{@link #COMPOSE}        - Unicode canonical decompositiion
     *                                  followed by canonical composition.
     *  <li>{@link #COMPOSE_COMPAT} - Unicode compatibility decompositiion
     *                                  follwed by canonical composition.
     *  <li>{@link #DECOMP}         - Unicode canonical decomposition
     *  <li>{@link #DECOMP_COMPAT}  - Unicode compatibility decomposition.
     *  <li>{@link #NO_OP}          - Do nothing but return characters
     *                                  from the underlying input text.
     * </ul>
     *
     * @see #getMode
     * @stable ICU 2.8
     */
    public void setMode(Mode newMode) {
        mode = newMode;
    }
    /**
     * Return the basic operation performed by this <tt>Normalizer</tt>
     *
     * @see #setMode
     * @stable ICU 2.8
     */
    public Mode getMode() {
        return mode;
    }

    /**
     * Set the input text over which this <tt>Normalizer</tt> will iterate.
     * The iteration position is set to the beginning of the input text->
     * @param newText   The new string to be normalized.
     * @stable ICU 2.8
     */
    public void setText(String newText) {

        UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
        if (newIter == null) {
            throw new InternalError("Could not create a new UCharacterIterator");
        }
        text = newIter;
        reset();
    }

    /**
     * Set the input text over which this <tt>Normalizer</tt> will iterate.
     * The iteration position is set to the beginning of the input text->
     * @param newText   The new string to be normalized.
     * @stable ICU 2.8
     */
    public void setText(CharacterIterator newText) {

        UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
        if (newIter == null) {
            throw new InternalError("Could not create a new UCharacterIterator");
        }
        text = newIter;
        currentIndex=nextIndex=0;
        clearBuffer();
    }

    //-------------------------------------------------------------------------
    // Private utility methods
    //-------------------------------------------------------------------------


    /* backward iteration --------------------------------------------------- */

    /*
     * read backwards and get norm32
     * return 0 if the character is <minC
     * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first
     * surrogate but read second!)
     */

    private static  long getPrevNorm32(UCharacterIterator src,
                                       int/*unsigned*/ minC,
                                       int/*unsigned*/ mask,
                                       char[] chars) {
        long norm32;
        int ch=0;
        /* need src.hasPrevious() */
        if((ch=src.previous()) == UCharacterIterator.DONE) {
            return 0;
        }
        chars[0]=(char)ch;
        chars[1]=0;

        /* check for a surrogate before getting norm32 to see if we need to
         * predecrement further */
        if(chars[0]<minC) {
            return 0;
        } else if(!UTF16.isSurrogate(chars[0])) {
            return NormalizerImpl.getNorm32(chars[0]);
        } else if(UTF16.isLeadSurrogate(chars[0]) || (src.getIndex()==0)) {
            /* unpaired surrogate */
            chars[1]=(char)src.current();
            return 0;
        } else if(UTF16.isLeadSurrogate(chars[1]=(char)src.previous())) {
            norm32=NormalizerImpl.getNorm32(chars[1]);
            if((norm32&mask)==0) {
                /* all surrogate pairs with this lead surrogate have irrelevant
                 * data */
                return 0;
            } else {
                /* norm32 must be a surrogate special */
                return NormalizerImpl.getNorm32FromSurrogatePair(norm32,chars[0]);
            }
        } else {
            /* unpaired second surrogate, undo the c2=src.previous() movement */
            src.moveIndex( 1);
            return 0;
        }
    }

    private interface IsPrevBoundary{
        public boolean isPrevBoundary(UCharacterIterator src,
                                      int/*unsigned*/ minC,
                                      int/*unsigned*/ mask,
                                      char[] chars);
    }
    private static final class IsPrevNFDSafe implements IsPrevBoundary{
        /*
         * for NF*D:
         * read backwards and check if the lead combining class is 0
         * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first
         * surrogate but read second!)
         */
        public boolean isPrevBoundary(UCharacterIterator src,
                                      int/*unsigned*/ minC,
                                      int/*unsigned*/ ccOrQCMask,
                                      char[] chars) {

            return NormalizerImpl.isNFDSafe(getPrevNorm32(src, minC,
                                                          ccOrQCMask, chars),
                                            ccOrQCMask,
                                            ccOrQCMask& NormalizerImpl.QC_MASK);
        }
    }

    private static final class IsPrevTrueStarter implements IsPrevBoundary{
        /*
         * read backwards and check if the character is (or its decomposition
         * begins with) a "true starter" (cc==0 and NF*C_YES)
         * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first
         * surrogate but read second!)
         */
        public boolean isPrevBoundary(UCharacterIterator src,
                                      int/*unsigned*/ minC,
                                      int/*unsigned*/ ccOrQCMask,
                                      char[] chars) {
            long norm32;
            int/*unsigned*/ decompQCMask;

            decompQCMask=(ccOrQCMask<<2)&0xf; /*decomposition quick check mask*/
            norm32=getPrevNorm32(src, minC, ccOrQCMask|decompQCMask, chars);
            return NormalizerImpl.isTrueStarter(norm32,ccOrQCMask,decompQCMask);
        }
    }

    private static int findPreviousIterationBoundary(UCharacterIterator src,
                                                     IsPrevBoundary obj,
                                                     int/*unsigned*/ minC,
                                                     int/*mask*/ mask,
                                                     char[] buffer,
                                                     int[] startIndex) {
        char[] chars=new char[2];
        boolean isBoundary;

        /* fill the buffer from the end backwards */
        startIndex[0] = buffer.length;
        chars[0]=0;
        while(src.getIndex()>0 && chars[0]!=UCharacterIterator.DONE) {
            isBoundary=obj.isPrevBoundary(src, minC, mask, chars);

            /* always write this character to the front of the buffer */
            /* make sure there is enough space in the buffer */
            if(startIndex[0] < (chars[1]==0 ? 1 : 2)) {

                // grow the buffer
                char[] newBuf = new char[buffer.length*2];
                /* move the current buffer contents up */
                System.arraycopy(buffer,startIndex[0],newBuf,
                                 newBuf.length-(buffer.length-startIndex[0]),
                                 buffer.length-startIndex[0]);
                //adjust the startIndex
                startIndex[0]+=newBuf.length-buffer.length;

                buffer=newBuf;
                newBuf=null;

            }

            buffer[--startIndex[0]]=chars[0];
            if(chars[1]!=0) {
                buffer[--startIndex[0]]=chars[1];
            }

            /* stop if this just-copied character is a boundary */
            if(isBoundary) {
                break;
            }
        }

        /* return the length of the buffer contents */
        return buffer.length-startIndex[0];
    }

    private static int previous(UCharacterIterator src,
                                char[] dest, int destStart, int destLimit,
                                Mode mode,
                                boolean doNormalize,
                                boolean[] pNeededToNormalize,
                                int options) {

        IsPrevBoundary isPreviousBoundary;
        int destLength, bufferLength;
        int/*unsigned*/ mask;
        int c,c2;

        char minC;
        int destCapacity = destLimit-destStart;
        destLength=0;

        if(pNeededToNormalize!=null) {
            pNeededToNormalize[0]=false;
        }
        minC = (char)mode.getMinC();
        mask = mode.getMask();
        isPreviousBoundary = mode.getPrevBoundary();

        if(isPreviousBoundary==null) {
            destLength=0;
            if((c=src.previous())>=0) {
                destLength=1;
                if(UTF16.isTrailSurrogate((char)c)) {
                    c2= src.previous();
                    if(c2!= UCharacterIterator.DONE) {
                        if(UTF16.isLeadSurrogate((char)c2)) {
                            if(destCapacity>=2) {
                                dest[1]=(char)c; // trail surrogate
                                destLength=2;
                            }
                            // lead surrogate to be written below
                            c=c2;
                        } else {
                            src.moveIndex(1);
                        }
                    }
                }

                if(destCapacity>0) {
                    dest[0]=(char)c;
                }
            }
            return destLength;
        }

        char[] buffer = new char[100];
        int[] startIndex= new int[1];
        bufferLength=findPreviousIterationBoundary(src,
                                                   isPreviousBoundary,
                                                   minC, mask,buffer,
                                                   startIndex);
        if(bufferLength>0) {
            if(doNormalize) {
                destLength=NormalizerBase.normalize(buffer,startIndex[0],
                                                startIndex[0]+bufferLength,
                                                dest, destStart,destLimit,
                                                mode, options);

                if(pNeededToNormalize!=null) {
                    pNeededToNormalize[0]=destLength!=bufferLength ||
                                          Utility.arrayRegionMatches(
                                            buffer,0,dest,
                                            destStart,destLimit
                                          );
                }
            } else {
                /* just copy the source characters */
                if(destCapacity>0) {
                    System.arraycopy(buffer,startIndex[0],dest,0,
                                     (bufferLength<destCapacity) ?
                                     bufferLength : destCapacity
                                     );
                }
            }
        }


        return destLength;
    }



    /* forward iteration ---------------------------------------------------- */
    /*
     * read forward and check if the character is a next-iteration boundary
     * if c2!=0 then (c, c2) is a surrogate pair
     */
    private interface IsNextBoundary{
        boolean isNextBoundary(UCharacterIterator src,
                               int/*unsigned*/ minC,
                               int/*unsigned*/ mask,
                               int[] chars);
    }
    /*
     * read forward and get norm32
     * return 0 if the character is <minC
     * if c2!=0 then (c2, c) is a surrogate pair
     * always reads complete characters
     */
    private static long /*unsigned*/ getNextNorm32(UCharacterIterator src,
                                                   int/*unsigned*/ minC,
                                                   int/*unsigned*/ mask,
                                                   int[] chars) {
        long norm32;

        /* need src.hasNext() to be true */
        chars[0]=src.next();
        chars[1]=0;

        if(chars[0]<minC) {
            return 0;
        }

        norm32=NormalizerImpl.getNorm32((char)chars[0]);
        if(UTF16.isLeadSurrogate((char)chars[0])) {
            if(src.current()!=UCharacterIterator.DONE &&
               UTF16.isTrailSurrogate((char)(chars[1]=src.current()))) {
                src.moveIndex(1); /* skip the c2 surrogate */
                if((norm32&mask)==0) {
                    /* irrelevant data */
                    return 0;
                } else {
                    /* norm32 must be a surrogate special */
                    return NormalizerImpl.getNorm32FromSurrogatePair(norm32,(char)chars[1]);
                }
            } else {
                /* unmatched surrogate */
                return 0;
            }
        }
        return norm32;
    }


    /*
     * for NF*D:
     * read forward and check if the lead combining class is 0
     * if c2!=0 then (c, c2) is a surrogate pair
     */
    private static final class IsNextNFDSafe implements IsNextBoundary{
        public boolean isNextBoundary(UCharacterIterator src,
                                      int/*unsigned*/ minC,
                                      int/*unsigned*/ ccOrQCMask,
                                      int[] chars) {
            return NormalizerImpl.isNFDSafe(getNextNorm32(src,minC,ccOrQCMask,chars),
                                            ccOrQCMask, ccOrQCMask&NormalizerImpl.QC_MASK);
        }
    }

    /*
     * for NF*C:
     * read forward and check if the character is (or its decomposition begins
     * with) a "true starter" (cc==0 and NF*C_YES)
     * if c2!=0 then (c, c2) is a surrogate pair
     */
    private static final class IsNextTrueStarter implements IsNextBoundary{
        public boolean isNextBoundary(UCharacterIterator src,
                                      int/*unsigned*/ minC,
                                      int/*unsigned*/ ccOrQCMask,
                                      int[] chars) {
            long norm32;
            int/*unsigned*/ decompQCMask;

            decompQCMask=(ccOrQCMask<<2)&0xf; /*decomposition quick check mask*/
            norm32=getNextNorm32(src, minC, ccOrQCMask|decompQCMask, chars);
            return NormalizerImpl.isTrueStarter(norm32, ccOrQCMask, decompQCMask);
        }
    }

    private static int findNextIterationBoundary(UCharacterIterator src,
                                                 IsNextBoundary obj,
                                                 int/*unsigned*/ minC,
                                                 int/*unsigned*/ mask,
                                                 char[] buffer) {
        if(src.current()==UCharacterIterator.DONE) {
            return 0;
        }

        /* get one character and ignore its properties */
        int[] chars = new int[2];
        chars[0]=src.next();
        buffer[0]=(char)chars[0];
        int bufferIndex = 1;

        if(UTF16.isLeadSurrogate((char)chars[0])&&
           src.current()!=UCharacterIterator.DONE) {
            if(UTF16.isTrailSurrogate((char)(chars[1]=src.next()))) {
                buffer[bufferIndex++]=(char)chars[1];
            } else {
                src.moveIndex(-1); /* back out the non-trail-surrogate */
            }
        }

        /* get all following characters until we see a boundary */
        /* checking hasNext() instead of c!=DONE on the off-chance that U+ffff
         * is part of the string */
        while( src.current()!=UCharacterIterator.DONE) {
            if(obj.isNextBoundary(src, minC, mask, chars)) {
                /* back out the latest movement to stop at the boundary */
                src.moveIndex(chars[1]==0 ? -1 : -2);
                break;
            } else {
                if(bufferIndex+(chars[1]==0 ? 1 : 2)<=buffer.length) {
                    buffer[bufferIndex++]=(char)chars[0];
                    if(chars[1]!=0) {
                        buffer[bufferIndex++]=(char)chars[1];
                    }
                } else {
                    char[] newBuf = new char[buffer.length*2];
                    System.arraycopy(buffer,0,newBuf,0,bufferIndex);
                    buffer = newBuf;
                    buffer[bufferIndex++]=(char)chars[0];
                    if(chars[1]!=0) {
                        buffer[bufferIndex++]=(char)chars[1];
                    }
                }
            }
        }

        /* return the length of the buffer contents */
        return bufferIndex;
    }

    private static int next(UCharacterIterator src,
                            char[] dest, int destStart, int destLimit,
                            NormalizerBase.Mode mode,
                            boolean doNormalize,
                            boolean[] pNeededToNormalize,
                            int options) {

        IsNextBoundary isNextBoundary;
        int /*unsigned*/ mask;
        int /*unsigned*/ bufferLength;
        int c,c2;
        char minC;
        int destCapacity = destLimit - destStart;
        int destLength = 0;
        if(pNeededToNormalize!=null) {
            pNeededToNormalize[0]=false;
        }

        minC = (char)mode.getMinC();
        mask = mode.getMask();
        isNextBoundary = mode.getNextBoundary();

        if(isNextBoundary==null) {
            destLength=0;
            c=src.next();
            if(c!=UCharacterIterator.DONE) {
                destLength=1;
                if(UTF16.isLeadSurrogate((char)c)) {
                    c2= src.next();
                    if(c2!= UCharacterIterator.DONE) {
                        if(UTF16.isTrailSurrogate((char)c2)) {
                            if(destCapacity>=2) {
                                dest[1]=(char)c2; // trail surrogate
                                destLength=2;
                            }
                            // lead surrogate to be written below
                        } else {
                            src.moveIndex(-1);
                        }
                    }
                }

                if(destCapacity>0) {
                    dest[0]=(char)c;
                }
            }
            return destLength;
        }

        char[] buffer=new char[100];
        int[] startIndex = new int[1];
        bufferLength=findNextIterationBoundary(src,isNextBoundary, minC, mask,
                                               buffer);
        if(bufferLength>0) {
            if(doNormalize) {
                destLength=mode.normalize(buffer,startIndex[0],bufferLength,
                                          dest,destStart,destLimit, options);

                if(pNeededToNormalize!=null) {
                    pNeededToNormalize[0]=destLength!=bufferLength ||
                                          Utility.arrayRegionMatches(buffer,startIndex[0],
                                            dest,destStart,
                                            destLength);
                }
            } else {
                /* just copy the source characters */
                if(destCapacity>0) {
                    System.arraycopy(buffer,0,dest,destStart,
                                     Math.min(bufferLength,destCapacity)
                                     );
                }


            }
        }
        return destLength;
    }

    private void clearBuffer() {
        bufferLimit=bufferStart=bufferPos=0;
    }

    private boolean nextNormalize() {

        clearBuffer();
        currentIndex=nextIndex;
        text.setIndex(nextIndex);

        bufferLimit=next(text,buffer,bufferStart,buffer.length,mode,true,null,options);

        nextIndex=text.getIndex();
        return (bufferLimit>0);
    }

    private boolean previousNormalize() {

        clearBuffer();
        nextIndex=currentIndex;
        text.setIndex(currentIndex);
        bufferLimit=previous(text,buffer,bufferStart,buffer.length,mode,true,null,options);

        currentIndex=text.getIndex();
        bufferPos = bufferLimit;
        return bufferLimit>0;
    }

    private int getCodePointAt(int index) {
        if( UTF16.isSurrogate(buffer[index])) {
            if(UTF16.isLeadSurrogate(buffer[index])) {
                if((index+1)<bufferLimit &&
                   UTF16.isTrailSurrogate(buffer[index+1])) {

/**代码未完, 请加载全部代码(NowJava.com).**/
展开阅读全文

关注时代Java

关注时代Java