?? uurifactory.java
字號:
/* UURIFactory * * $Id: UURIFactory.java 5106 2007-05-01 00:07:29Z gojomo $ * * Created on July 16, 2004 * * Copyright (C) 2003 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */package org.archive.net;import gnu.inet.encoding.IDNA;import gnu.inet.encoding.IDNAException;import it.unimi.dsi.mg4j.util.MutableString;import java.io.UnsupportedEncodingException;import java.util.Arrays;import java.util.BitSet;import java.util.logging.Level;import java.util.logging.Logger;import java.util.regex.Matcher;import java.util.regex.Pattern;import org.apache.commons.httpclient.URI;import org.apache.commons.httpclient.URIException;import org.archive.util.TextUtils;/** * Factory that returns UURIs. * * Does escaping and fixup on URIs massaging in accordance with RFC2396 * and to match browser practice. For example, it removes any * '..' if first thing in the path as per IE, converts backslashes to forward * slashes, and discards any 'fragment'/anchor portion of the URI. This * class will also fail URIs if they are longer than IE's allowed maximum * length. * * <p>TODO: Test logging. * * @author stack */public class UURIFactory extends URI { private static final long serialVersionUID = -6146295130382209042L; /** * Logging instance. 
*/
    private static Logger logger =
        Logger.getLogger(UURIFactory.class.getName());

    /**
     * The single instance of this factory.
     *
     * <p>Created eagerly while the class is initialized; the static
     * <code>getInstance</code> entry points delegate to it.
     *
     * <p>NOTE: static initializers run in textual order, so this
     * constructor call executes before the non-constant static fields
     * declared further down (for example the compiled <code>Pattern</code>s)
     * have been assigned. The constructor must therefore not depend on them.
     */
    private static final UURIFactory factory = new UURIFactory();

    /**
     * RFC 2396-inspired regex.
     *
     * From the RFC Appendix B:
     * <pre>
     * URI Generic Syntax                August 1998
     *
     * B. Parsing a URI Reference with a Regular Expression
     *
     * As described in Section 4.3, the generic URI syntax is not sufficient
     * to disambiguate the components of some forms of URI.  Since the
     * "greedy algorithm" described in that section is identical to the
     * disambiguation method used by POSIX regular expressions, it is
     * natural and commonplace to use a regular expression for parsing the
     * potential four components and fragment identifier of a URI reference.
     *
     * The following line is the regular expression for breaking-down a URI
     * reference into its components.
     *
     *      ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
     *       12            3  4          5       6  7        8 9
     *
     * The numbers in the second line above are only to assist readability;
     * they indicate the reference points for each subexpression (i.e., each
     * paired parenthesis).  We refer to the value matched for subexpression
     * &lt;n&gt; as $&lt;n&gt;.  For example, matching the above expression to
     *
     *      http://www.ics.uci.edu/pub/ietf/uri/#Related
     *
     * results in the following subexpression matches:
     *
     *      $1 = http:
     *      $2 = http
     *      $3 = //www.ics.uci.edu
     *      $4 = www.ics.uci.edu
     *      $5 = /pub/ietf/uri/
     *      $6 = &lt;undefined&gt;
     *      $7 = &lt;undefined&gt;
     *      $8 = #Related
     *      $9 = Related
     *
     * where &lt;undefined&gt; indicates that the component is not present, as is
     * the case for the query component in the above example.  Therefore, we
     * can determine the value of the four components and fragment as
     *
     *      scheme    = $2
     *      authority = $4
     *      path      = $5
     *      query     = $7
     *      fragment  = $9
     * </pre>
     *
     * --
     * <p>Below differs from the rfc regex in that it has java escaping of
     * regex characters and we allow a URI made of a fragment only (Added extra
     * group so indexing is off by one after scheme).
*/
    final static Pattern RFC2396REGEX = Pattern.compile(
        "^(([^:/?#]+):)?((//([^/?#]*))?([^?#]*)(\\?([^#]*))?)?(#(.*))?");
    //    12            34  5          6       7   8          9 A
    //              2 1             54        6          87 3      A9
    // First row: where each capturing group opens; second row: where it
    // closes (columns refer to the Java string literal above).
    // 1: scheme:
    // 2: scheme
    // 3: //authority/path?query (the extra group wrapping everything
    //    between the scheme and the fragment)
    // 4: //authority
    // 5: authority
    // 6: path
    // 7: ?query
    // 8: query
    // 9: #fragment
    // A: fragment (group 10)

    // Regex: one or more "/../" segments at the very start of the input.
    public static final String SLASHDOTDOTSLASH = "^(/\\.\\./)+";
    public static final String SLASH = "/";
    public static final String HTTP = "http";
    public static final String HTTP_PORT = ":80";
    public static final String HTTPS = "https";
    public static final String HTTPS_PORT = ":443";
    public static final String DOT = ".";
    public static final String EMPTY_STRING = "";
    // Non-breaking space (U+00A0), as opposed to the plain ASCII SPACE below.
    public static final String NBSP = "\u00A0";
    public static final String SPACE = " ";
    public static final String ESCAPED_SPACE = "%20";
    // Regex: input ending in one or more "%20"; group 1 is what precedes them.
    public static final String TRAILING_ESCAPED_SPACE = "^(.*)(%20)+$";

    // For each character below there are up to three constants: the literal
    // character, a *_PATTERN form that is backslash-escaped so it can be used
    // as a regex, and an ESCAPED_* form holding its percent-encoding.
    public static final String PIPE = "|";
    public static final String PIPE_PATTERN = "\\|";
    public static final String ESCAPED_PIPE = "%7C";
    public static final String CIRCUMFLEX = "^";
    public static final String CIRCUMFLEX_PATTERN = "\\^";
    public static final String ESCAPED_CIRCUMFLEX = "%5E";
    public static final String QUOT = "\"";
    public static final String ESCAPED_QUOT = "%22";
    public static final String SQUOT = "'";
    public static final String ESCAPED_SQUOT = "%27";
    // Despite the name, this is the backtick / grave accent (0x60).
    public static final String APOSTROPH = "`";
    public static final String ESCAPED_APOSTROPH = "%60";
    public static final String LSQRBRACKET = "[";
    public static final String LSQRBRACKET_PATTERN = "\\[";
    public static final String ESCAPED_LSQRBRACKET = "%5B";
    public static final String RSQRBRACKET = "]";
    public static final String RSQRBRACKET_PATTERN = "\\]";
    public static final String ESCAPED_RSQRBRACKET = "%5D";
    public static final String LCURBRACKET = "{";
    public static final String LCURBRACKET_PATTERN = "\\{";
    public static final String ESCAPED_LCURBRACKET = "%7B";
    public static final String RCURBRACKET = "}";
    public static final String RCURBRACKET_PATTERN = "\\}";
    public static final String ESCAPED_RCURBRACKET = "%7D";
    public static final String BACKSLASH = "\\";
    public static final String BACKSLASH_PATTERN = "\\\\";
    public static final String ESCAPED_BACKSLASH = "%5C";

    // Regex: a run of newline, carriage-return or tab characters.
    public static final String STRAY_SPACING = "[\n\r\t]+";
    // Replacement text: "%25" (an encoded '%') followed by captured group 1.
    // Presumably paired with IMPROPERESC below -- usage is outside this view.
    public static final String IMPROPERESC_REPLACE = "%25$1";
    // Regex: a '%' that does not start a valid two-hex-digit escape, i.e. it
    // is followed by a non-hex char, by any char and then a non-hex char, or
    // by end of input. Group 1 captures whatever followed the '%'.
    public static final String IMPROPERESC =
        "%((?:[^\\p{XDigit}])|(?:.[^\\p{XDigit}])|(?:\\z))";
    public static final String COMMERCIAL_AT = "@";
    public static final char PERCENT_SIGN = '%';
    public static final char COLON = ':';

    /**
     * First percent sign in string followed by two hex chars.
     */
    public static final String URI_HEX_ENCODING =
        "^[^%]*%[\\p{XDigit}][\\p{XDigit}].*";

    /**
     * Authority port number regex.
     *
     * Group 1 is everything up to and including the last colon; group 2 is
     * the trailing run of digits.
     */
    final static Pattern PORTREGEX = Pattern.compile("(.*:)([0-9]+)$");

    /**
     * Characters we'll accept in the domain label part of a URI
     * authority: ASCII letters-digits-hyphen (LDH) plus underscore,
     * with single intervening '.' characters.
     *
     * (We accept '_' because DNS servers have tolerated it for many
     * years counter to spec; we also accept dash patterns and ACE
     * prefixes that will be rejected by IDN-punycoding attempt.)
     *
     * The possessive quantifiers ("++") stop the regex engine from
     * backtracking into an already-matched label.
     */
    final static String ACCEPTABLE_ASCII_DOMAIN =
        "^(?:[a-zA-Z0-9_-]++(?:\\.)?)++$";

    /**
     * Pattern that looks for case of three or more slashes after the
     * scheme. If found, we replace them with two only as mozilla does.
     *
     * Only lower-case "http" and "https" are matched. Group 1 is the scheme
     * plus the first two slashes; group 2 is everything after the surplus
     * slashes.
     */
    final static Pattern HTTP_SCHEME_SLASHES =
        Pattern.compile("^(https?://)/+(.*)");

    /**
     * Pattern that looks for case of two or more slashes in a path.
     */
    final static Pattern MULTIPLE_SLASHES = Pattern.compile("//+");

    /**
     * System property key for list of supported schemes.
     *
     * This is only the suffix: the constructor prepends the runtime class
     * name to form the full property name.
     */
    private static final String SCHEMES_KEY = ".schemes";

    /**
     * System property key for list of purposefully-ignored schemes.
*/ private static final String IGNORED_SCHEMES_KEY = ".ignored-schemes"; private String[] schemes = null; private String[] ignoredSchemes = null; public static final int IGNORED_SCHEME = 9999999; /** * Protected constructor. */ private UURIFactory() { super(); String s = System.getProperty(this.getClass().getName() + SCHEMES_KEY); if (s != null && s.length() > 0) { schemes = s.split("[, ]+"); Arrays.sort(schemes); } String ignored = System.getProperty(this.getClass().getName() + IGNORED_SCHEMES_KEY); if (ignored != null && ignored.length() > 0) { ignoredSchemes = ignored.split("[, ]+"); Arrays.sort(ignoredSchemes); } } /** * @param uri URI as string. * @return An instance of UURI * @throws URIException */ public static UURI getInstance(String uri) throws URIException { return UURIFactory.factory.create(uri); } /** * @param uri URI as string. * @param charset Character encoding of the passed uri string. * @return An instance of UURI * @throws URIException */ public static UURI getInstance(String uri, String charset) throws URIException { return UURIFactory.factory.create(uri, charset); } /** * @param base Base uri to use resolving passed relative uri. * @param relative URI as string. * @return An instance of UURI
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -