?? htmlparser.jj
字號:
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// HTMLParser.jj

options {
  STATIC = false;
  OPTIMIZE_TOKEN_MANAGER = true;
  //DEBUG_LOOKAHEAD = true;
  //DEBUG_TOKEN_MANAGER = true;
}

PARSER_BEGIN(HTMLParser)

package org.apache.lucene.demo.html;

import java.io.*;
import java.util.Properties;

/**
 * HTML parser that collects a document's title, its META tags and a short
 * summary, and writes the document's text to a pipe whose reading end is
 * returned by {@link #getReader()}.
 *
 * <p>Parsing is started by {@link #getReader()} on a separate
 * <code>ParserThread</code> (declared outside this file). The getters poll
 * the parser's state while holding this object's monitor.
 */
public class HTMLParser {
  /** Maximum number of characters kept in the summary. */
  public static int SUMMARY_LENGTH = 200;

  /** Text seen so far inside the TITLE element. */
  StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
  /** Text seen so far outside the TITLE element, up to SUMMARY_LENGTH. */
  StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
  /** META name (or HTTP-EQUIV) to content pairs collected so far. */
  Properties metaTags = new Properties();
  /** Pending META name, waiting for its content attribute (or null). */
  String currentMetaTag = null;
  /** Pending META content, waiting for its name attribute (or null). */
  String currentMetaContent = null;
  /** Number of characters written to the pipe so far. */
  int length = 0;
  boolean titleComplete = false;
  boolean inTitle = false;
  boolean inMetaTag = false;
  boolean inStyle = false;
  boolean afterTag = false;
  boolean afterSpace = false;
  String eol = System.getProperty("line.separator");
  Reader pipeIn = null;
  Writer pipeOut;
  private MyPipedInputStream pipeInStream = null;
  private PipedOutputStream pipeOutStream = null;

  /** PipedInputStream that can report whether its buffer has filled up. */
  private class MyPipedInputStream extends PipedInputStream {

    public MyPipedInputStream() {
      super();
    }

    public MyPipedInputStream(PipedOutputStream src) throws IOException {
      super(src);
    }

    /**
     * @return true when at least PIPE_SIZE bytes are available to read
     */
    public boolean full() throws IOException {
      return this.available() >= PipedInputStream.PIPE_SIZE;
    }
  }

  /**
   * Creates a parser reading from the given file by delegating to the
   * constructor that takes a stream (not declared in this section;
   * presumably generated by JavaCC).
   *
   * @deprecated Use HTMLParser(FileInputStream) instead
   */
  public HTMLParser(File file) throws FileNotFoundException {
    this(new FileInputStream(file));
  }

  /**
   * Returns the trimmed title text collected so far. Starts parsing if it
   * has not been started, then waits (re-checking every 10 ms) until the
   * title is flagged complete or the pipe buffer is full.
   */
  public String getTitle() throws IOException, InterruptedException {
    if (pipeIn == null)
      getReader();                                // spawn parsing thread
    while (true) {
      synchronized(this) {
        if (titleComplete || pipeInStream.full())
          break;
        wait(10);
      }
    }
    return title.toString().trim();
  }

  /**
   * Returns the META tags collected so far. Uses the same wait condition
   * as {@link #getTitle()}.
   */
  public Properties getMetaTags() throws IOException, InterruptedException {
    if (pipeIn == null)
      getReader();                                // spawn parsing thread
    while (true) {
      synchronized(this) {
        if (titleComplete || pipeInStream.full())
          break;
        wait(10);
      }
    }
    return metaTags;
  }

  /**
   * Returns the trimmed summary, truncated to SUMMARY_LENGTH characters.
   * Falls back to the title when the summary is empty or merely starts
   * with the title. Waits until enough summary text has been collected or
   * the pipe buffer is full.
   */
  public String getSummary() throws IOException, InterruptedException {
    if (pipeIn == null)
      getReader();                                // spawn parsing thread
    while (true) {
      synchronized(this) {
        if (summary.length() >= SUMMARY_LENGTH || pipeInStream.full())
          break;
        wait(10);
      }
    }
    if (summary.length() > SUMMARY_LENGTH)
      summary.setLength(SUMMARY_LENGTH);

    String sum = summary.toString().trim();
    String tit = getTitle();
    if (sum.startsWith(tit) || sum.equals(""))
      return tit;
    else
      return sum;
  }

  /**
   * Returns the reading end of the text pipe. On the first call this
   * creates the pipe (UTF-16BE on both ends) and starts a ParserThread
   * for this parser; later calls return the same reader.
   */
  public Reader getReader() throws IOException {
    if (pipeIn == null) {
      pipeInStream = new MyPipedInputStream();
      pipeOutStream = new PipedOutputStream(pipeInStream);
      pipeIn = new InputStreamReader(pipeInStream, "UTF-16BE");
      pipeOut = new OutputStreamWriter(pipeOutStream, "UTF-16BE");

      Thread thread = new ParserThread(this);
      thread.start();                             // start parsing
    }

    return pipeIn;
  }

  /**
   * Appends text to the summary while it is shorter than SUMMARY_LENGTH,
   * and wakes waiting threads once that length is reached.
   */
  void addToSummary(String text) {
    if (summary.length() < SUMMARY_LENGTH) {
      summary.append(text);
      if (summary.length() >= SUMMARY_LENGTH) {
        synchronized(this) {
          notifyAll();
        }
      }
    }
  }

  /**
   * Records a run of document text: ignored inside STYLE, appended to the
   * title inside TITLE, otherwise added to the summary. Text that is not
   * ignored is also written to the pipe.
   */
  void addText(String text) throws IOException {
    if (inStyle)
      return;
    if (inTitle)
      title.append(text);
    else {
      addToSummary(text);
      // StringBuffer does not override equals(), so the former test
      // !title.equals("") was always true and flagged the title complete
      // even when no title text had been read. Test the length instead.
      // NOTE(review): for a document without title text, titleComplete is
      // now never set here; presumably ParserThread sets it when parsing
      // ends -- confirm against ParserThread.
      if (!titleComplete && title.length() > 0) {  // finished title
        synchronized(this) {
          titleComplete = true;                   // tell waiting threads
          notifyAll();
        }
      }
    }

    length += text.length();
    pipeOut.write(text);

    afterSpace = false;
  }

  /**
   * Stores the pending META name/content pair in metaTags and clears both
   * pending values. Callers check that both values are non-null first.
   */
  void addMetaTag() {
    metaTags.setProperty(currentMetaTag, currentMetaContent);
    currentMetaTag = null;
    currentMetaContent = null;
    return;
  }

  /**
   * Emits a single separator unless the previous output was already one.
   * A space goes to the title or summary; the pipe receives the platform
   * line separator when directly after a tag, otherwise a space.
   */
  void addSpace() throws IOException {
    if (!afterSpace) {
      if (inTitle)
        title.append(" ");
      else
        addToSummary(" ");

      String space = afterTag ? eol : " ";
      length += space.length();
      pipeOut.write(space);
      afterSpace = true;
    }
  }

//  void handleException(Exception e) {
//    System.out.println(e.toString());  // print the error message
//    System.out.println("Skipping...");
//    Token t;
//    do {
//      t = getNextToken();
//    } while (t.kind != TagEnd);
//  }
}

PARSER_END(HTMLParser)


/**
 * Top-level production: a sequence of tags, declarations, comments,
 * scripts, words, entities, punctuation and whitespace up to end of file.
 * afterTag records whether the most recent item was markup.
 */
void HTMLDocument() throws IOException :
{
  Token t;
}
{
//  try {
    ( Tag()         { afterTag = true; }
    | t=Decl()      { afterTag = true; }
    | CommentTag()  { afterTag = true; }
    | ScriptTag()   { afterTag = true; }
    | t=<Word>      { addText(t.image); afterTag = false; }
    | t=<Entity>    { addText(Entities.decode(t.image)); afterTag = false; }
    | t=<Punct>     { addText(t.image); afterTag = false; }
    | <Space>       { addSpace(); afterTag = false; }
    )* <EOF>
//  } catch (ParseException e) {
//    handleException(e);
//  }
}

/**
 * One opening or closing tag with its attributes. Updates the
 * inTitle/inMetaTag/inStyle flags from the tag name, emits IMG ALT text
 * in square brackets, and collects META name (or HTTP-EQUIV) and content
 * attribute values, lower-cased, into metaTags.
 */
void Tag() throws IOException :
{
  Token t1, t2;
  boolean inImg = false;
}
{
  t1=<TagName> {
    String tagName = t1.image.toLowerCase();
    if (Tags.WS_ELEMS.contains(tagName)) {
      addSpace();
    }
    inTitle = tagName.equalsIgnoreCase("<title");   // keep track if in <TITLE>
    inMetaTag = tagName.equalsIgnoreCase("<META");  // keep track if in <META>
    inStyle = tagName.equalsIgnoreCase("<STYLE");   // keep track if in <STYLE>
    inImg = tagName.equalsIgnoreCase("<img");       // keep track if in <IMG>
  }
  (t1=<ArgName>
   (<ArgEquals>
    (t2=ArgValue()                                  // save ALT text in IMG tag
     {
       if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
         addText("[" + t2.image + "]");

       if (inMetaTag &&
           (  t1.image.equalsIgnoreCase("name") ||
              t1.image.equalsIgnoreCase("HTTP-EQUIV")
           )
           && t2 != null)
       {
         currentMetaTag = t2.image.toLowerCase();
         // The pair is stored as soon as both halves have been seen,
         // whichever attribute comes first.
         if (currentMetaTag != null && currentMetaContent != null) {
           addMetaTag();
         }
       }
       if (inMetaTag && t1.image.equalsIgnoreCase("content") && t2 != null)
       {
         currentMetaContent = t2.image.toLowerCase();
         if (currentMetaTag != null && currentMetaContent != null) {
           addMetaTag();
         }
       }
     }
    )?
   )?
  )*
  <TagEnd>
}

/**
 * An attribute value, unquoted or in single or double quotes. Returns the
 * value token, or null for an empty quoted value.
 */
Token ArgValue() :
{
  Token t = null;
}
{
  t=<ArgValue>                              { return t; }
| LOOKAHEAD(2)
  <ArgQuote1> <CloseQuote1>                 { return t; }
| <ArgQuote1> t=<Quote1Text> <CloseQuote1>  { return t; }
| LOOKAHEAD(2)
  <ArgQuote2> <CloseQuote2>                 { return t; }
| <ArgQuote2> t=<Quote2Text> <CloseQuote2>  { return t; }
}


/**
 * A declaration introduced by a DeclName token; its contents are skipped
 * up to the closing TagEnd. Returns the DeclName token.
 */
Token Decl() :
{
  Token t;
}
{
  t=<DeclName> ( <ArgName> | ArgValue() | <ArgEquals> )* <TagEnd>
  { return t; }
}


/** A comment in either of the two recognised forms; its text is discarded. */
void CommentTag() :
{}
{
  (<Comment1> ( <CommentText1> )* <CommentEnd1>)
 |
  (<Comment2> ( <CommentText2> )* <CommentEnd2>)
}

/** A script element; its text is discarded. */
void ScriptTag() :
{}
{
  <ScriptStart> ( <ScriptText> )* <ScriptEnd>
}


// Tokens of the DEFAULT lexical state (document text between markup).
TOKEN :
{
  < ScriptStart: "<script" > : WithinScript
| < TagName:    "<" ("/")? ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag
| < DeclName:   "<"  "!"   ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag

| < Comment1:   "<!--" > : WithinComment1
| < Comment2:   "<!" > : WithinComment2

| < Word:     ( <LET> | <LET> (["+","/"])+ | <NUM> ["\""] |
                <LET> ["-","'"] <LET> | ("$")? <NUM> [",","."] <NUM> )+ >
| < #LET:     ["A"-"Z","a"-"z","0"-"9"] >
| < #NUM:     ["0"-"9"] >
| < #HEX:     ["0"-"9","A"-"F","a"-"f"] >

| < Entity:   ( "&" (["A"-"Z","a"-"z"])+ (";")?
              | "&" "#" (<NUM>)+ (";")?
              | "&" "#" ["X","x"] (<HEX>)+ (";")? ) >

| < Space:    (<SP>)+ >
| < #SP:      [" ","\t","\r","\n"] >

| < Punct:    ~[] > // Keep this last.  It is a catch-all.
}

// Script content, up to and including the closing script tag.
<WithinScript> TOKEN:
{
  < ScriptText: (~["<",">"])+ | "<" | ">" >
| < ScriptEnd: "</script" (~["<",">"])* ">" > : DEFAULT
}

// Inside a tag, after the tag name: attribute names, "=" and the tag end.
<WithinTag> TOKEN:
{
  < ArgName:    (~[" ","\t","\r","\n","=",">","'","\""])
                (~[" ","\t","\r","\n","=",">"])* >
| < ArgEquals:  "=" > : AfterEquals
| < TagEnd:     ">" | "=>" > : DEFAULT
}

// Directly after "=": an unquoted attribute value.
<AfterEquals> TOKEN:
{
  < ArgValue:   (~[" ","\t","\r","\n","=",">","'","\""])
                (~[" ","\t","\r","\n",">"])* > : WithinTag
}

// Opening quotes of a quoted attribute value.
<WithinTag, AfterEquals> TOKEN:
{
  < ArgQuote1:  "'" > : WithinQuote1
| < ArgQuote2:  "\"" > : WithinQuote2
}

// Whitespace inside a tag is skipped.
<WithinTag, AfterEquals> SKIP:
{
  < <Space> >
}

// Single-quoted attribute value.
<WithinQuote1> TOKEN:
{
  < Quote1Text: (~["'"])+ >
| < CloseQuote1: <ArgQuote1> > : WithinTag
}

// Double-quoted attribute value.
<WithinQuote2> TOKEN:
{
  < Quote2Text: (~["\""])+ >
| < CloseQuote2: <ArgQuote2> > : WithinTag
}

// Comment opened by "<!--", closed by "-->".
<WithinComment1> TOKEN :
{
  < CommentText1: (~["-"])+ | "-" >
| < CommentEnd1: "-->" > : DEFAULT
}

// Comment opened by "<!", closed by the next ">".
<WithinComment2> TOKEN :
{
  < CommentText2: (~[">"])+ >
| < CommentEnd2: ">" > : DEFAULT
}
?? 快捷鍵說明
複製代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -