// luceneindexlocaldisk.java
package Chapter12;
import java.io.IOException;
import java.io.File;
import java.io.FileReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.standard.*;
import java.io.*;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.util.PDFTextStripper;
import org.pdfbox.searchengine.lucene.LucenePDFDocument;
import org.apache.poi.hwpf.extractor.*;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.*;
import org.apache.poi.hwpf.usermodel.*;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.TextExtractingVisitor;
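/*******************************************************************
* Traverses a specified local directory and looks up the files in it.
* Files with the specified suffixes are analyzed and indexed with Lucene,
* in preparation for later searching.
*******************************************************************/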
public class LuceneIndexLocalDisk {
// Location of the on-disk Lucene index. main() wraps this in a File and passes it
// to indexBuilder() as indexPath, where it is handed to the IndexWriter constructor.
private static String Dest_Index_Path = "D:\\workshop\\alldata3";
// Disabled alternative source directory, kept for reference.
//private static String Text_File_Path = "D:\\workshop\\ch12\\012\\";
// Source directory to index. main() wraps this in a File and passes it to
// indexBuilder() as localPath, which requires it to be an existing, readable directory.
private static String Text_File_Path = "D:\\科技部項目\\參考文獻資料\\";
// Disabled alternative source directory, kept for reference.
//private static String Text_File_Path = "C:\\test\\";
/*========================================================
* 主函數,指定索引目錄和待分析的目錄,生成Lucene索引
*========================================================*/
/**
 * Program entry point: builds the index for the configured source directory.
 *
 * Wraps Dest_Index_Path and Text_File_Path in File objects, delegates to
 * indexBuilder(File, File) and prints the document count it returns.
 * If the build throws an IOException, its stack trace is printed and no
 * count is reported.
 *
 * @param args command-line arguments; not used
 */
public static void main(String[] args) {
    File indexDir = new File(Dest_Index_Path);
    File sourceDir = new File(Text_File_Path);

    int indexed;
    try {
        indexed = indexBuilder(indexDir, sourceDir);
    } catch (IOException ioe) {
        // Build failed: report and stop without printing a count.
        ioe.printStackTrace();
        return;
    }
    System.out.println("Index Finished " + indexed + " docs");
}
/*========================================================
* 索引創建函數,生成IndexWriter創建索引,調用子目錄索引函數,并優化
* 存儲本地磁盤索引
*========================================================*/
/**
 * Builds the on-disk index at indexPath from the contents of localPath.
 *
 * Validates the source directory, opens an IndexWriter on indexPath with a
 * StandardAnalyzer, lets SubindexBuilder add documents, records the document
 * count, optimizes the index and closes the writer.
 *
 * @param indexPath location of the index on disk
 * @param localPath source directory; must exist, be a directory and be readable
 * @return the writer's document count, taken before optimize() is called
 * @throws IOException if localPath fails validation, or if creating,
 *         optimizing or closing the index fails
 */
public static int indexBuilder(File indexPath, File localPath) throws IOException {
    if (!localPath.exists() || !localPath.isDirectory() || !localPath.canRead()) {
        throw new IOException(localPath + "不存在或者不允許訪問" );
    }
    System.out.println("目標路徑完好");

    // Third argument is Lucene's "create" flag (see the IndexWriter constructor docs).
    IndexWriter fsWriter = new IndexWriter(indexPath, new StandardAnalyzer(), true);
    try {
        fsWriter.setUseCompoundFile(true);
        SubindexBuilder(fsWriter, localPath);
        int num = fsWriter.docCount();
        fsWriter.optimize();
        return num;
    } finally {
        // Always close the writer, even when indexing fails part-way, so it is
        // not left open on the index directory.
        fsWriter.close();
    }
}
/*========================================================
* 判斷當前文件名是否符合文件后綴要求
*========================================================*/
/**
 * Tells whether a file name ends with one of the accepted suffixes:
 * .txt, .html, .ini, .conf, .pdf or .doc.
 *
 * The comparison ignores case, matching the case-insensitive extension
 * checks done in fileindexBuilder, so "REPORT.PDF" is accepted just like
 * "report.pdf".
 *
 * @param name file name to test; may be null
 * @return true if name ends with an accepted suffix; false otherwise,
 *         including when name is null
 */
private static boolean IsValidType(String name) {
    if (name == null) {
        return false;
    }
    String[] suffixes = { ".txt", ".html", ".ini", ".conf", ".pdf", ".doc" };
    for (int i = 0; i < suffixes.length; i++) {
        String suffix = suffixes[i];
        // Case-insensitive endsWith: a negative offset (name shorter than
        // the suffix) makes regionMatches return false.
        if (name.regionMatches(true, name.length() - suffix.length(),
                suffix, 0, suffix.length())) {
            return true;
        }
    }
    return false;
}
/*========================================================
* 處理各種不同類型文檔,調用相應的參數,合并到本地磁盤索引當中
*========================================================*/
private static void fileindexBuilder(IndexWriter fswriter,File subfile)
if( subfile.isHidden() || !subfile.exists() || !subfile.canRead()){
return ;
}
String strname = subfile.getName();
int dotpos = strname.indexOf(".");
if( (dotpos >0) && (dotpos < strname.length()))
{
String ext = strname.substring(dotpos + 1,strname.length());
if( ext.equalsIgnoreCase("pdf") )
Handlepdf(fswriter ,subfile);
else if( ext.equalsIgnoreCase("doc") )
Handledoc(fswriter ,subfile);
else if( ext.equalsIgnoreCase("xml") )
Handlexml(fswriter ,subfile);
else if( ext.equalsIgnoreCase("html") || ext.equalsIgnoreCase("htm") )
Handlehtml(fswriter ,subfile);
else if( IsValidType(strname))
Handletxt(fswriter ,subfile);;
}
}
/*========================================================
* 創建RAM內存索引,生成并添純文本文檔,合并到本地磁盤索引當中
*========================================================*/
/**
 * Indexes one plain-text file and merges it into the on-disk index.
 *
 * Builds a temporary single-document index in a RAMDirectory: the document
 * comes from FileDocument.Document(subPath) and gets three extra stored,
 * untokenized fields ("filename", "filepath", and "filetype" = "txt").
 * The RAM index is then merged into fswriter via addIndexes.
 *
 * An IOException is reported via its stack trace and the file is skipped;
 * the success message is printed only when the merge actually completed.
 *
 * @param fswriter writer of the on-disk index that receives the document
 * @param subPath  text file to index
 */
private static void Handletxt (IndexWriter fswriter,File subPath)
{
    try {
        Directory ramdirectory = new RAMDirectory();
        Analyzer textAnalyzer = new StandardAnalyzer();
        IndexWriter ramWriter = new IndexWriter(ramdirectory, textAnalyzer, true);
        try {
            ramWriter.setUseCompoundFile(true);

            // Base document built from the file by the FileDocument helper.
            Document document = FileDocument.Document(subPath);

            // File name field.
            document.add(new Field("filename", subPath.getName(),
                    Field.Store.YES, Field.Index.UN_TOKENIZED));
            // Absolute path field.
            document.add(new Field("filepath", subPath.getAbsolutePath(),
                    Field.Store.YES, Field.Index.UN_TOKENIZED));
            // Document type field.
            document.add(new Field("filetype", "txt",
                    Field.Store.YES, Field.Index.UN_TOKENIZED));

            ramWriter.addDocument(document);
            ramWriter.optimize();
        } finally {
            // Close the in-memory writer whether or not indexing succeeded.
            ramWriter.close();
        }

        // Merge the finished in-memory index into the on-disk index.
        fswriter.addIndexes(new Directory[]{ramdirectory});
        System.out.println("----------創建索引:Txt 文件成功. ----------");
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/*========================================================
* 創建RAM內存索引,生成并添新文檔,合并到本地磁盤索引當中
*========================================================*/
private static void Handlepdf (IndexWriter fswriter,File subPath)
{
// 處理分析PDF文檔,并索引文檔內容
try {
Directory ramdirectory = new RAMDirectory();
Analyzer TextAnalyzer = new StandardAnalyzer(); // 生成分析器
IndexWriter RAMWriter = new IndexWriter(ramdirectory ,TextAnalyzer ,true); // 生成索引器
RAMWriter.setUseCompoundFile(true); // 根據指定文件創建輸入流
FileInputStream instream = new FileInputStream(subPath);
System.out.println("" + subPath );
int len = (int) subPath.length() + 1;
byte[] buffer = new byte[ len ] ;
instream.read(buffer);
// 由PDF文件生成文檔對象,包含contents字段
//Document document = LucenePDFDocument.getDocument( instream ) ;
Document document = new Document();
Field field_name = new Field("filename", subPath.getName(),
Field.Store.YES,Field.Index.UN_TOKENIZED); // 追加名字字段
document.add(field_name);
Field field_path = new Field("filepath", subPath.getAbsolutePath(),
Field.Store.YES,Field.Index.UN_TOKENIZED); // 追加路徑字段
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -