?? luceneindexlocaldisk.java

?? Lucene+nuctch一書的全部源碼測試源碼和幾個簡單的項目
?? JAVA
?? 第 1 頁 / 共 2 頁
字號:
12 下一頁
package Chapter12;

import java.io.IOException;
import java.io.File;
import java.io.FileReader;

import org.apache.lucene.store.Directory; 
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.standard.*;
import java.io.*;

import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.util.PDFTextStripper;
import org.pdfbox.searchengine.lucene.LucenePDFDocument;

import org.apache.poi.hwpf.extractor.*;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.*;
import org.apache.poi.hwpf.usermodel.*;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.TextExtractingVisitor;

/*******************************************************************
 * 本代碼完成本地指定目錄的遍歷和文件查找。對指定后綴的文件進行分析，利用Lucene建立
 * 索引，為后續檢索使用做好準備。
 *******************************************************************/
public class LuceneIndexLocalDisk {

	// Directory of the on-disk index; main wraps it in a File and passes it to indexBuilder.
	private static String Dest_Index_Path = "D:\\workshop\\alldata3";
	// Root directory whose files are indexed; main passes it to indexBuilder as the source.
	// The commented-out declarations are alternative source roots.
	//private static String Text_File_Path  = "D:\\workshop\\ch12\\012\\";
	private static String Text_File_Path  = "D:\\科技部項目\\參考文獻資料\\";
	//private static String Text_File_Path  = "C:\\test\\";
	
	/*========================================================
	 * 主函數，指定索引目錄和待分析的目錄，生成Lucene索引
	 *========================================================*/
	/**
	 * Entry point: indexes the configured source directory into the configured
	 * index directory and prints the number of indexed documents.
	 *
	 * @param args command-line arguments (not used)
	 */
	public static void main(String[] args) {
		final File indexDir = new File(Dest_Index_Path);
		final File sourceDir = new File(Text_File_Path);

		int indexedDocs;
		try {
			indexedDocs = indexBuilder(indexDir, sourceDir);
		} catch (IOException ioFailure) {
			// Indexing failed: report the failure and end without the summary line.
			ioFailure.printStackTrace();
			return;
		}
		System.out.println("Index Finished " + indexedDocs + "  docs");
	}
	/*========================================================
	 * 索引創建函數，生成IndexWriter創建索引，調用子目錄索引函數，并優化
	 * 存儲本地磁盤索引
	 *========================================================*/
	/**
	 * Creates the on-disk index at {@code indexPath} and fills it from the
	 * files under {@code localPath}.
	 *
	 * <p>The source directory is validated first. A writer is then opened in
	 * create mode (third constructor argument {@code true}), the directory is
	 * handed to {@code SubindexBuilder}, the document count is read, and the
	 * index is optimized. The writer is closed on every path, including when
	 * indexing fails part-way.
	 *
	 * @param indexPath directory in which the index is written
	 * @param localPath directory whose contents are indexed
	 * @return the writer's document count, read before optimizing
	 * @throws IOException if {@code localPath} does not exist, is not a
	 *         directory or cannot be read, or if building the index fails
	 */
	public static int indexBuilder( File indexPath , File localPath ) 
	throws IOException{
		if(!localPath.exists() || !localPath.isDirectory() || !localPath.canRead()){
			throw new IOException(localPath + "不存在或者不允許訪問" );
		}
		System.out.println("目標路徑完好");
		IndexWriter diskWriter = new IndexWriter(indexPath,new StandardAnalyzer(),true);
		try {
			diskWriter.setUseCompoundFile(true);

			SubindexBuilder(diskWriter,localPath);
			int num = diskWriter.docCount();
			diskWriter.optimize();
			return num;
		} finally {
			// Close even when indexing throws; otherwise the writer (and the
			// index write lock it holds) is leaked and later runs cannot open
			// the index.
			diskWriter.close();
		}
	}
	
	/*========================================================
	 * 判斷當前文件名是否符合文件后綴要求
	 *========================================================*/
	/**
	 * Reports whether a file name ends with one of the accepted suffixes:
	 * .txt, .html, .ini, .conf, .pdf or .doc. Matching is case-sensitive.
	 *
	 * @param name file name to test
	 * @return true when the name ends with an accepted suffix, false otherwise
	 */
	private static boolean IsValidType(String name){
		final String[] acceptedSuffixes = { ".txt", ".html", ".ini", ".conf", ".pdf", ".doc" };
		for (int i = 0; i < acceptedSuffixes.length; i++) {
			if (name.endsWith(acceptedSuffixes[i])) {
				return true;
			}
		}
		return false;
	}
	/*========================================================
	 * 處理各種不同類型文檔,調用相應的參數，合并到本地磁盤索引當中
	 *========================================================*/
	/**
	 * Routes one file to the handler that matches its extension.
	 *
	 * <p>Hidden, missing or unreadable files are ignored, as are names with no
	 * dot or with a leading dot. The extension is the text after the LAST dot,
	 * so a name such as {@code report.final.pdf} is treated as a PDF. Taking
	 * the text after the first dot would give {@code final.pdf}, which matches
	 * no handler and would fall through to the plain-text handler.
	 *
	 * <p>Dispatch (extension compared case-insensitively): pdf, doc, xml,
	 * html/htm; any other name accepted by {@code IsValidType} is indexed as
	 * plain text. Everything else is skipped.
	 *
	 * @param fswriter writer of the on-disk index the handlers add to
	 * @param subfile  file to index
	 * @throws IOException declared for the handlers that are invoked
	 */
	private static void  fileindexBuilder(IndexWriter fswriter,File subfile)  
	throws IOException{

		if( subfile.isHidden() || !subfile.exists() || !subfile.canRead()){
			return ;
		}
		String strname = subfile.getName();
		// Same names are skipped as before: no dot at all, or a leading dot.
		if( strname.indexOf('.') <= 0 ){
			return ;
		}
		// Extension = text after the last dot (empty when the name ends with '.').
		String ext = strname.substring(strname.lastIndexOf('.') + 1);
		if( ext.equalsIgnoreCase("pdf") ){
			Handlepdf(fswriter ,subfile);
		} else if( ext.equalsIgnoreCase("doc") ){
			Handledoc(fswriter ,subfile);
		} else if( ext.equalsIgnoreCase("xml") ){
			Handlexml(fswriter ,subfile);
		} else if( ext.equalsIgnoreCase("html") || ext.equalsIgnoreCase("htm") ){
			Handlehtml(fswriter ,subfile);
		} else if( IsValidType(strname) ){
			Handletxt(fswriter ,subfile);
		}
	}
	/*========================================================
	 * 創建RAM內存索引，生成并添純文本文檔，合并到本地磁盤索引當中
	 *========================================================*/	
	/**
	 * Indexes one plain-text file and merges the result into the on-disk index.
	 *
	 * <p>A temporary in-memory index is created, a document for the file is
	 * obtained from {@code FileDocument.Document} (presumably carrying the file
	 * contents; confirm against that class), and the fields "filename",
	 * "filepath" and "filetype" ("txt") are added to it. The in-memory index is
	 * then optimized, closed and merged into {@code fswriter}.
	 *
	 * <p>An {@link IOException} is reported via its stack trace and not
	 * rethrown. The success message is printed only when the merge succeeded.
	 *
	 * @param fswriter writer of the on-disk index that receives the document
	 * @param subPath  text file to index
	 */
	private static void  Handletxt (IndexWriter fswriter,File subPath)
	{
		try {
			Directory ramdirectory = new RAMDirectory();
			Analyzer TextAnalyzer = new StandardAnalyzer();             // analyzer for the temporary index
			IndexWriter RAMWriter = new IndexWriter(ramdirectory ,TextAnalyzer ,true); // writer for the temporary index
			try {
				RAMWriter.setUseCompoundFile(true);

				// No FileInputStream is opened here: the stream was never read
				// or closed, so it only leaked a file handle per indexed file.
				Document document = FileDocument.Document(subPath) ;

				Field field_name = new Field("filename", subPath.getName(),
						Field.Store.YES,Field.Index.UN_TOKENIZED);         // file name field
				document.add(field_name);

				Field field_path = new Field("filepath", subPath.getAbsolutePath(),
						Field.Store.YES,Field.Index.UN_TOKENIZED);         // absolute path field
				document.add(field_path);

				Field field_type = new Field("filetype","txt",
						Field.Store.YES,Field.Index.UN_TOKENIZED);         // document type field
				document.add(field_type);

				RAMWriter.addDocument(document);
				RAMWriter.optimize();
			} finally {
				// Close the temporary writer on failure as well as on success.
				RAMWriter.close();
			}
			fswriter.addIndexes(new Directory[]{ramdirectory});

			// Reported only after the merge succeeded, never after a failure.
			System.out.println("----------創建索引：Txt 文件成功. ----------");
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
	
	/*========================================================
	 * 創建RAM內存索引，生成并添新文檔，合并到本地磁盤索引當中
	 *========================================================*/
	private static void  Handlepdf (IndexWriter fswriter,File subPath)
	{
			// 處理分析PDF文檔，并索引文檔內容
			try {
				Directory ramdirectory = new RAMDirectory();
				Analyzer TextAnalyzer = new StandardAnalyzer();            // 生成分析器
				IndexWriter RAMWriter = new IndexWriter(ramdirectory ,TextAnalyzer ,true); // 生成索引器
				
				RAMWriter.setUseCompoundFile(true);                        // 根據指定文件創建輸入流

				FileInputStream instream = new FileInputStream(subPath); 
			
				System.out.println("" + subPath );
				
				int len = (int) subPath.length() + 1;
				byte[] buffer = new byte[ len ] ;
				instream.read(buffer);
				
				
				// 由PDF文件生成文檔對象，包含contents字段
				//Document document = LucenePDFDocument.getDocument( instream ) ;
				Document document = new Document();
				Field field_name = new Field("filename", subPath.getName(),   
						Field.Store.YES,Field.Index.UN_TOKENIZED);         // 追加名字字段
				document.add(field_name);
				
				Field field_path = new Field("filepath", subPath.getAbsolutePath(), 
						Field.Store.YES,Field.Index.UN_TOKENIZED);         // 追加路徑字段
12 下一頁
?? 文件大小 22461 K
?? 上傳用戶 cnnotes
?? 所屬分類 Java編程
??? 相關標簽

#Lucene #nuctch #源碼 #測試
?? 快捷鍵說明

復制代碼 Ctrl + C
搜索代碼 Ctrl + F
全屏模式 F11
切換主題 Ctrl + Shift + D
顯示快捷鍵 ?
增大字號 Ctrl + =
減小字號 Ctrl + -
亚洲欧美第一页_禁久久精品乱码_粉嫩av一区二区三区免费野_久草精品视频

?? luceneindexlocaldisk.java

?? 快捷鍵說明