?? htmlparser.java
字號:
/* Generated By:JavaCC: Do not edit this line. HTMLParser.java */package org.apache.lucene.demo.html;import java.io.*;import java.util.Properties;public class HTMLParser implements HTMLParserConstants { public static int SUMMARY_LENGTH = 200; StringBuffer title = new StringBuffer(SUMMARY_LENGTH); StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2); Properties metaTags=new Properties(); String currentMetaTag=null; String currentMetaContent=null; int length = 0; boolean titleComplete = false; boolean inTitle = false; boolean inMetaTag = false; boolean inStyle = false; boolean afterTag = false; boolean afterSpace = false; String eol = System.getProperty("line.separator"); Reader pipeIn = null; Writer pipeOut; private MyPipedInputStream pipeInStream = null; private PipedOutputStream pipeOutStream = null; private class MyPipedInputStream extends PipedInputStream{ public MyPipedInputStream(){ super(); } public MyPipedInputStream(PipedOutputStream src) throws IOException{ super(src); } public boolean full() throws IOException{ return this.available() >= PipedInputStream.PIPE_SIZE; } } /** * @deprecated Use HTMLParser(FileInputStream) instead */ public HTMLParser(File file) throws FileNotFoundException { this(new FileInputStream(file)); } public String getTitle() throws IOException, InterruptedException { if (pipeIn == null) getReader(); // spawn parsing thread while (true) { synchronized(this) { if (titleComplete || pipeInStream.full()) break; wait(10); } } return title.toString().trim(); } public Properties getMetaTags() throws IOException,InterruptedException { if (pipeIn == null) getReader(); // spawn parsing thread while (true) { synchronized(this) { if (titleComplete || pipeInStream.full()) break; wait(10); } } return metaTags; } public String getSummary() throws IOException, InterruptedException { if (pipeIn == null) getReader(); // spawn parsing thread while (true) { synchronized(this) { if (summary.length() >= SUMMARY_LENGTH || pipeInStream.full()) break; wait(10); } } if (summary.length() > SUMMARY_LENGTH) summary.setLength(SUMMARY_LENGTH); String sum = summary.toString().trim(); String tit = getTitle(); if (sum.startsWith(tit) || sum.equals("")) return tit; else return sum; } public Reader getReader() throws IOException { if (pipeIn == null) { pipeInStream = new MyPipedInputStream(); pipeOutStream = new PipedOutputStream(pipeInStream); pipeIn = new InputStreamReader(pipeInStream, "UTF-16BE"); pipeOut = new OutputStreamWriter(pipeOutStream, "UTF-16BE"); Thread thread = new ParserThread(this); thread.start(); // start parsing } return pipeIn; } void addToSummary(String text) { if (summary.length() < SUMMARY_LENGTH) { summary.append(text); if (summary.length() >= SUMMARY_LENGTH) { synchronized(this) { notifyAll(); } } } } void addText(String text) throws IOException { if (inStyle) return; if (inTitle) title.append(text); else { addToSummary(text); if (!titleComplete && !title.equals("")) { // finished title synchronized(this) { titleComplete = true; // tell waiting threads notifyAll(); } } } length += text.length(); pipeOut.write(text); afterSpace = false; } void addMetaTag() { metaTags.setProperty(currentMetaTag, currentMetaContent); currentMetaTag = null; currentMetaContent = null; return; } void addSpace() throws IOException { if (!afterSpace) { if (inTitle) title.append(" "); else addToSummary(" "); String space = afterTag ? eol : " "; length += space.length(); pipeOut.write(space); afterSpace = true; } } final public void HTMLDocument() throws ParseException, IOException { Token t; label_1: while (true) { switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case ScriptStart: case TagName: case DeclName: case Comment1: case Comment2: case Word: case Entity: case Space: case Punct: ; break; default: jj_la1[0] = jj_gen; break label_1; } switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case TagName: Tag(); afterTag = true; break; case DeclName: t = Decl(); afterTag = true; break; case Comment1: case Comment2: CommentTag(); afterTag = true; break; case ScriptStart: ScriptTag(); afterTag = true; break; case Word: t = jj_consume_token(Word); addText(t.image); afterTag = false; break; case Entity: t = jj_consume_token(Entity); addText(Entities.decode(t.image)); afterTag = false; break; case Punct: t = jj_consume_token(Punct); addText(t.image); afterTag = false; break; case Space: jj_consume_token(Space); addSpace(); afterTag = false; break; default: jj_la1[1] = jj_gen; jj_consume_token(-1); throw new ParseException(); } } jj_consume_token(0); } final public void Tag() throws ParseException, IOException { Token t1, t2; boolean inImg = false; t1 = jj_consume_token(TagName); String tagName = t1.image.toLowerCase(); if(Tags.WS_ELEMS.contains(tagName) ) { addSpace(); } inTitle = tagName.equalsIgnoreCase("<title"); // keep track if in <TITLE> inMetaTag = tagName.equalsIgnoreCase("<META"); // keep track if in <META> inStyle = tagName.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE> inImg = tagName.equalsIgnoreCase("<img"); // keep track if in <IMG> label_2: while (true) { switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case ArgName: ; break; default: jj_la1[2] = jj_gen; break label_2; } t1 = jj_consume_token(ArgName); switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case ArgEquals: jj_consume_token(ArgEquals); switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case ArgValue: case ArgQuote1: case ArgQuote2: t2 = ArgValue(); if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null) addText("[" + t2.image + "]"); if(inMetaTag && ( t1.image.equalsIgnoreCase("name") || t1.image.equalsIgnoreCase("HTTP-EQUIV") ) && t2 != null) { currentMetaTag=t2.image.toLowerCase(); if(currentMetaTag != null && currentMetaContent != null) { addMetaTag(); } } if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 !=null) { currentMetaContent=t2.image.toLowerCase(); if(currentMetaTag != null && currentMetaContent != null) { addMetaTag(); } } break; default: jj_la1[3] = jj_gen; ; } break; default: jj_la1[4] = jj_gen; ; } } jj_consume_token(TagEnd); } final public Token ArgValue() throws ParseException { Token t = null; switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case ArgValue: t = jj_consume_token(ArgValue); {if (true) return t;} break; default: jj_la1[5] = jj_gen; if (jj_2_1(2)) { jj_consume_token(ArgQuote1); jj_consume_token(CloseQuote1); {if (true) return t;} } else { switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case ArgQuote1: jj_consume_token(ArgQuote1); t = jj_consume_token(Quote1Text); jj_consume_token(CloseQuote1); {if (true) return t;} break; default: jj_la1[6] = jj_gen; if (jj_2_2(2)) { jj_consume_token(ArgQuote2); jj_consume_token(CloseQuote2); {if (true) return t;} } else { switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case ArgQuote2: jj_consume_token(ArgQuote2); t = jj_consume_token(Quote2Text); jj_consume_token(CloseQuote2); {if (true) return t;} break; default: jj_la1[7] = jj_gen; jj_consume_token(-1); throw new ParseException(); } } } } } throw new Error("Missing return statement in function"); } final public Token Decl() throws ParseException { Token t; t = jj_consume_token(DeclName); label_3: while (true) { switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case ArgName: case ArgEquals: case ArgValue: case ArgQuote1: case ArgQuote2: ; break; default: jj_la1[8] = jj_gen; break label_3; } switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case ArgName: jj_consume_token(ArgName); break; case ArgValue: case ArgQuote1: case ArgQuote2: ArgValue(); break; case ArgEquals: jj_consume_token(ArgEquals); break; default:
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -