?? re.java
字號:
/*
 * gnu/regexp/RE.java
 * Copyright (C) 1998-2001 Wes Biggs
 *
 * This library is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
package gnu.regexp;

import java.io.InputStream;
import java.io.Reader;
import java.io.Serializable;
import java.util.Locale;
import java.util.PropertyResourceBundle;
import java.util.ResourceBundle;
import java.util.Vector;

/**
 * Mutable, serializable holder for two ints.
 * RE.initialize allocates one instance for its {x,y} calculations.
 * NOTE(review): presumably first is the minimum and second the maximum
 * repeat count -- confirm against the code that fills it in.
 */
class IntPair implements Serializable {
  public int first, second;
}

/**
 * Mutable, serializable holder for one character read from a pattern.
 * RE.initialize hands a single instance to getCharUnit(), which reads the
 * next character unit (including backslash escapes) into it.
 * NOTE(review): bk is presumably true when the character was preceded by
 * a backslash -- confirm in getCharUnit().
 */
class CharUnit implements Serializable {
  public char ch;
  public boolean bk;
}

/**
 * RE provides the user interface for compiling and matching regular
 * expressions.
 * <P>
 * A regular expression object (class RE) is compiled by constructing it
 * from a String, StringBuffer or character array, with optional
 * compilation flags (below)
 * and an optional syntax specification (see RESyntax; if not specified,
 * <code>RESyntax.RE_SYNTAX_PERL5</code> is used).
 * <P>
 * Once compiled, a regular expression object is reusable as well as
 * threadsafe: multiple threads can use the RE instance simultaneously
 * to match against different input text.
 * <P>
 * Various methods attempt to match input text against a compiled
 * regular expression. These methods are:
 * <LI><code>isMatch</code>: returns true if the input text in its
 * entirety matches the regular expression pattern.
* <LI><code>getMatch</code>: returns the first match found in the
 * input text, or null if no match is found.
 * <LI><code>getAllMatches</code>: returns an array of all
 * non-overlapping matches found in the input text. If no matches are
 * found, the array is zero-length.
 * <LI><code>substitute</code>: substitute the first occurrence of the
 * pattern in the input text with a replacement string (which may
 * include metacharacters $0-$9, see REMatch.substituteInto).
 * <LI><code>substituteAll</code>: same as above, but repeat for each
 * match before returning.
 * <LI><code>getMatchEnumeration</code>: returns an REMatchEnumeration
 * object that allows iteration over the matches (see
 * REMatchEnumeration for some reasons why you may want to do this
 * instead of using <code>getAllMatches</code>).
 * <P>
 *
 * These methods all have similar argument lists. The input can be a
 * String, a character array, a StringBuffer, a Reader or an
 * InputStream of some sort. Note that when using a Reader or
 * InputStream, the stream read position cannot be guaranteed after
 * attempting a match (this is not a bug, but a consequence of the way
 * regular expressions work). Using an REMatchEnumeration can
 * eliminate most positioning problems.
 *
 * <P>
 *
 * The optional index argument specifies the offset from the beginning
 * of the text at which the search should start (see the descriptions
 * of some of the execution flags for how this can affect positional
 * pattern operators). For a Reader or InputStream, this means an
 * offset from the current read position, so subsequent calls with the
 * same index argument on a Reader or an InputStream will not
 * necessarily access the same position on the stream, whereas
 * repeated searches at a given index in a fixed string will return
 * consistent results.
 *
 * <P>
 * You can optionally affect the execution environment by using a
 * combination of execution flags (constants listed below).
* * <P> * All operations on a regular expression are performed in a * thread-safe manner. * * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A> * @version 1.1.5-dev, to be released */public class RE extends REToken { // This String will be returned by getVersion() private static final String VERSION = "1.1.5-dev"; // The localized strings are kept in a separate file private static ResourceBundle messages = PropertyResourceBundle.getBundle("gnu/regexp/MessagesBundle", Locale.getDefault()); // These are, respectively, the first and last tokens in our linked list // If there is only one token, firstToken == lastToken private REToken firstToken, lastToken; // This is the number of subexpressions in this regular expression, // with a minimum value of zero. Returned by getNumSubs() private int numSubs; /** Minimum length, in characters, of any possible match. */ private int minimumLength; /** * Compilation flag. Do not differentiate case. Subsequent * searches using this RE will be case insensitive. */ public static final int REG_ICASE = 2; /** * Compilation flag. The match-any-character operator (dot) * will match a newline character. When set this overrides the syntax * bit RE_DOT_NEWLINE (see RESyntax for details). This is equivalent to * the "/s" operator in Perl. */ public static final int REG_DOT_NEWLINE = 4; /** * Compilation flag. Use multiline mode. In this mode, the ^ and $ * anchors will match based on newlines within the input. This is * equivalent to the "/m" operator in Perl. */ public static final int REG_MULTILINE = 8; /** * Execution flag. * The match-beginning operator (^) will not match at the beginning * of the input string. Useful for matching on a substring when you * know the context of the input is such that position zero of the * input to the match test is not actually position zero of the text. * <P> * This example demonstrates the results of various ways of matching on * a substring. 
* <P> * <CODE> * String s = "food bar fool";<BR> * RE exp = new RE("^foo.");<BR> * REMatch m0 = exp.getMatch(s);<BR> * REMatch m1 = exp.getMatch(s.substring(8));<BR> * REMatch m2 = exp.getMatch(s.substring(8),0,RE.REG_NOTBOL); <BR> * REMatch m3 = exp.getMatch(s,8); <BR> * REMatch m4 = exp.getMatch(s,8,RE.REG_ANCHORINDEX); <BR> * <P> * // Results:<BR> * // m0.toString(): "food"<BR> * // m1.toString(): "fool"<BR> * // m2.toString(): null<BR> * // m3.toString(): null<BR> * // m4.toString(): "fool"<BR> * </CODE> */ public static final int REG_NOTBOL = 16; /** * Execution flag. * The match-end operator ($) does not match at the end * of the input string. Useful for matching on substrings. */ public static final int REG_NOTEOL = 32; /** * Execution flag. * When a match method is invoked that starts matching at a non-zero * index into the input, treat the input as if it begins at the index * given. The effect of this flag is that the engine does not "see" * any text in the input before the given index. This is useful so * that the match-beginning operator (^) matches not at position 0 * in the input string, but at the position the search started at * (based on the index input given to the getMatch function). See * the example under REG_NOTBOL. It also affects the use of the \< * and \b operators. */ public static final int REG_ANCHORINDEX = 64; /** * Execution flag. * The substitute and substituteAll methods will not attempt to * interpolate occurrences of $1-$9 in the replacement text with * the corresponding subexpressions. For example, you may want to * replace all matches of "one dollar" with "$1". */ public static final int REG_NO_INTERPOLATE = 128; /** Returns a string representing the version of the gnu.regexp package. 
*/ public static final String version() { return VERSION; } // Retrieves a message from the ResourceBundle static final String getLocalizedMessage(String key) { return messages.getString(key); } /** * Constructs a regular expression pattern buffer without any compilation * flags set, and using the default syntax (RESyntax.RE_SYNTAX_PERL5). * * @param pattern A regular expression pattern, in the form of a String, * StringBuffer or char[]. Other input types will be converted to * strings using the toString() method. * @exception REException The input pattern could not be parsed. * @exception NullPointerException The pattern was null. */ public RE(Object pattern) throws REException { this(pattern,0,RESyntax.RE_SYNTAX_PERL5,0,0); } /** * Constructs a regular expression pattern buffer using the specified * compilation flags and the default syntax (RESyntax.RE_SYNTAX_PERL5). * * @param pattern A regular expression pattern, in the form of a String, * StringBuffer, or char[]. Other input types will be converted to * strings using the toString() method. * @param cflags The logical OR of any combination of the compilation flags listed above. * @exception REException The input pattern could not be parsed. * @exception NullPointerException The pattern was null. */ public RE(Object pattern, int cflags) throws REException { this(pattern,cflags,RESyntax.RE_SYNTAX_PERL5,0,0); } /** * Constructs a regular expression pattern buffer using the specified * compilation flags and regular expression syntax. * * @param pattern A regular expression pattern, in the form of a String, * StringBuffer, or char[]. Other input types will be converted to * strings using the toString() method. * @param cflags The logical OR of any combination of the compilation flags listed above. * @param syntax The type of regular expression syntax to use. * @exception REException The input pattern could not be parsed. * @exception NullPointerException The pattern was null. 
*/ public RE(Object pattern, int cflags, RESyntax syntax) throws REException { this(pattern,cflags,syntax,0,0); } // internal constructor used for alternation private RE(REToken first, REToken last,int subs, int subIndex, int minLength) { super(subIndex); firstToken = first; lastToken = last; numSubs = subs; minimumLength = minLength; addToken(new RETokenEndSub(subIndex)); } private RE(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub) throws REException { super(myIndex); // Subexpression index of this token. initialize(patternObj, cflags, syntax, myIndex, nextSub); } // For use by subclasses protected RE() { super(0); } // The meat of construction protected void initialize(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub) throws REException { char[] pattern; if (patternObj instanceof String) { pattern = ((String) patternObj).toCharArray(); } else if (patternObj instanceof char[]) { pattern = (char[]) patternObj; } else if (patternObj instanceof StringBuffer) { pattern = new char [((StringBuffer) patternObj).length()]; ((StringBuffer) patternObj).getChars(0,pattern.length,pattern,0); } else { pattern = patternObj.toString().toCharArray(); } int pLength = pattern.length; numSubs = 0; // Number of subexpressions in this token. Vector branches = null; // linked list of tokens (sort of -- some closed loops can exist) firstToken = lastToken = null; // Precalculate these so we don't pay for the math every time we // need to access them. boolean insens = ((cflags & REG_ICASE) > 0); // Parse pattern into tokens. Does anyone know if it's more efficient // to use char[] than a String.charAt()? I'm assuming so. // index tracks the position in the char array int index = 0; // this will be the current parse character (pattern[index]) CharUnit unit = new CharUnit(); // This is used for {x,y} calculations IntPair minMax = new IntPair(); // Buffer a token so we can create a TokenRepeated, etc. 
REToken currentToken = null; char ch; while (index < pLength) { // read the next character unit (including backslash escapes) index = getCharUnit(pattern,index,unit); // ALTERNATION OPERATOR // \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT) // not available if RE_LIMITED_OPS is set // TODO: the '\n' literal here should be a test against REToken.newline, // which unfortunately may be more than a single character. if ( ( (unit.ch == '|' && (syntax.get(RESyntax.RE_NO_BK_VBAR) ^ unit.bk)) || (syntax.get(RESyntax.RE_NEWLINE_ALT) && (unit.ch == '\n') && !unit.bk) ) && !syntax.get(RESyntax.RE_LIMITED_OPS)) { // make everything up to here be a branch. create vector if nec. addToken(currentToken); RE theBranch = new RE(firstToken, lastToken, numSubs, subIndex, minimumLength); minimumLength = 0; if (branches == null) { branches = new Vector(); } branches.addElement(theBranch); firstToken = lastToken = currentToken = null; }
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -