// articleclassifierimpl.java
// (code-viewer page residue) Font size:
package article.service.impl;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.Map.Entry;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import article.entity.Article;
import article.entity.Category;
/**
 * Assigns articles to categories by comparing term-frequency vectors built with Lucene.
 * <p>
 * A category's vector is the sum of the term frequencies of all of its sample articles;
 * every candidate article gets its own vector. An article is assigned to a category when
 * the angle returned by {@link #caculateVector(Map, Map)} is below {@code vectorGene}.
 *
 * @author ahuaxuan(aaron zhang)
 * @since 2008-2-18
 * @version $Id$
 */
public class ArticleClassifierImpl {

    private static final Log logger = LogFactory.getLog(ArticleClassifierImpl.class);

    /**
     * Upper bound (exclusive) on the angle, in radians, for an article to be assigned to a
     * category.
     * NOTE(review): for positive frequencies {@link #caculateVector(Map, Map)} yields values
     * in [0, PI/2), so the value 2 accepts every article that shares at least one filtered
     * term with the category -- confirm this is intended.
     */
    private double vectorGene = 2;

    /**
     * Classifies the given articles against the given categories.
     *
     * @param categoryList categories, each providing the sample articles that define it
     * @param articleList  articles to classify
     * @return map from category key to the list of matching article keys; an empty map if
     *         anything fails (the failure is logged, not propagated)
     */
    public Map<String, List<String>> matchArticle(List<Category> categoryList, List<Article> articleList) {
        try {
            Map<String, Map<String, Integer>> classVector = getClassVector(categoryList);
            Map<String, Map<String, Integer>> articleVector = getArticleVector(articleList);
            return analyse(articleVector, classVector);
        } catch (Exception e) {
            // Best-effort API: a failure is logged and reported to the caller as "no matches".
            logger.error("", e);
            return Collections.emptyMap();
        }
    }

    /**
     * Compares every article vector with every (filtered) category vector.
     *
     * @param articleVectorMap  article key -> term-frequency vector
     * @param categoryVectorMap category key -> term-frequency vector
     * @return category key -> keys of the articles whose angle is below {@code vectorGene};
     *         every category key is present, possibly with an empty list
     */
    protected Map<String, List<String>> analyse(Map<String, Map<String, Integer>> articleVectorMap, Map<String, Map<String, Integer>> categoryVectorMap) {
        Map<String, List<String>> map = new HashMap<String, List<String>>();
        for (Entry<String, Map<String, Integer>> categoryEntry : categoryVectorMap.entrySet()) {
            // The filtered vector depends only on the category, so build it once per
            // category instead of once per (category, article) pair.
            Map<String, Integer> filteredCategoryVector = filterVectorMap(categoryEntry.getValue());

            List<String> itemIdList = new ArrayList<String>();
            Map<String, String> tempMap = new HashMap<String, String>();
            for (Entry<String, Map<String, Integer>> itemEntry : articleVectorMap.entrySet()) {
                double acos = caculateVector(itemEntry.getValue(), filteredCategoryVector);
                // A NaN result (no shared term) fails this comparison and is therefore rejected.
                if (acos < vectorGene) {
                    itemIdList.add(itemEntry.getKey());
                    tempMap.put(itemEntry.getKey(), String.valueOf(acos));
                }
            }

            if (logger.isDebugEnabled()) {
                logger.debug(new StringBuilder().append("++++++++++++ ").append("article vector informations of category which id is ")
                        .append(categoryEntry.getKey()).append(" ++++++++"));
                for (Entry<String, String> e : tempMap.entrySet()) {
                    logger.debug(new StringBuilder().append("articleId=").append(e.getKey())
                            .append("---------").append("acos value=").append(e.getValue()));
                }
            }
            map.put(categoryEntry.getKey(), itemIdList);
        }
        return map;
    }

    /**
     * Builds one term-frequency vector per category by indexing the content of all of the
     * category's articles into an in-memory index.
     *
     * @param categoryList categories to index
     * @return category id (as string) -> summed term frequencies; empty if the list is
     *         {@code null} or empty
     * @throws Exception if indexing or reading the in-memory index fails
     */
    protected Map<String, Map<String, Integer>> getClassVector(List<Category> categoryList) throws Exception {
        if (categoryList == null || categoryList.size() == 0) {
            if (logger.isDebugEnabled()) {
                logger.debug("The list of new categoryList which should be classified is null or size = 0");
            }
            return Collections.emptyMap();
        }

        Map<String, Map<String, Integer>> categoryMap = new HashMap<String, Map<String, Integer>>();
        Directory ramDir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(ramDir, new PaodingAnalyzer(), true);
        try {
            for (Category cRc : categoryList) {
                for (Article item : cRc.getArticleList()) {
                    Document doc = new Document();
                    doc.add(new Field("description", item.getContent(), Field.Store.NO,
                            Field.Index.TOKENIZED, TermVector.YES));
                    doc.add(new Field("category", cRc.getId().toString(), Field.Store.YES, Field.Index.NO));
                    writer.addDocument(doc);
                }
            }
            if (logger.isDebugEnabled()) {
                logger.debug("Generate the index in the memory, the size of categoryList list is " + categoryList.size());
            }
        } finally {
            // Always release the writer, even when adding a document fails.
            writer.close();
        }

        buildContentVectors(ramDir, categoryMap, "category", "description");
        return categoryMap;
    }

    /**
     * Builds one term-frequency vector per article by indexing the article text into an
     * in-memory index.
     *
     * @param articleList articles to index
     * @return article id -> term frequencies; empty if the list is {@code null} or empty
     * @throws Exception if indexing or reading the in-memory index fails
     */
    protected Map<String, Map<String, Integer>> getArticleVector(List<Article> articleList) throws Exception {
        if (articleList == null || articleList.size() == 0) {
            if (logger.isDebugEnabled()) {
                logger.debug("The list of articles which should be classified is null or size = 0");
            }
            // Nothing to index; mirrors getClassVector and avoids an NPE on a null list.
            return Collections.emptyMap();
        }

        Map<String, Map<String, Integer>> articleMap = new HashMap<String, Map<String, Integer>>();
        Directory articleRamDir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(articleRamDir, new PaodingAnalyzer(), true);
        try {
            for (Article article : articleList) {
                Document doc = new Document();
                doc.add(new Field("articleId", article.getId(),
                        Field.Store.YES, Field.Index.NO));
                doc.add(new Field("description", article.getText(), Field.Store.NO, Field.Index.TOKENIZED, TermVector.YES));
                writer.addDocument(doc);
            }
            writer.flush();
        } finally {
            // Always release the writer, even when adding a document fails.
            writer.close();
        }

        buildContentVectors(articleRamDir, articleMap, "articleId", "description");
        return articleMap;
    }

    /**
     * Reads every live document of the index and adds the term frequencies of
     * {@code fieldName} to the vector stored under the document's {@code key} field value.
     *
     * @param ramDir     directory holding the index to read
     * @param contentMap output map: key field value -> accumulated term frequencies
     * @param key        name of the stored field whose value identifies the vector
     * @param fieldName  name of the field whose term vector is accumulated
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException           if the index cannot be read
     */
    protected void buildContentVectors(Directory ramDir, Map<String, Map<String, Integer>> contentMap, String key, String fieldName) throws CorruptIndexException, IOException {
        IndexReader reader = IndexReader.open(ramDir);
        try {
            // Document ids range over [0, maxDoc); numDocs() excludes deleted documents and
            // would cut the scan short whenever the index contains deletions.
            int maxDoc = reader.maxDoc();
            for (int docId = 0; docId < maxDoc; docId++) {
                if (reader.isDeleted(docId)) {
                    continue;
                }
                Document doc = reader.document(docId);
                String contentKey = doc.getField(key).stringValue();
                Map<String, Integer> vectorMap = contentMap.get(contentKey);
                if (vectorMap == null) {
                    vectorMap = new TreeMap<String, Integer>();
                    contentMap.put(contentKey, vectorMap);
                }

                TermFreqVector termFreqVector = reader.getTermFreqVector(docId, fieldName);
                if (termFreqVector == null) {
                    // No term vector for this document; its (possibly empty) entry stays in the map.
                    continue;
                }
                addTermFreqToMap(vectorMap, termFreqVector);
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Adds the frequencies of a Lucene term vector to an accumulated term-frequency map.
     *
     * @param vectorMap term -> accumulated frequency; updated in place
     * @param termFreqv term vector whose terms and frequencies are added
     */
    protected void addTermFreqToMap(Map<String, Integer> vectorMap, TermFreqVector termFreqv) {
        String[] terms = termFreqv.getTerms();
        int[] freqs = termFreqv.getTermFrequencies();
        for (int i = 0; i < terms.length; i++) {
            Integer previous = vectorMap.get(terms[i]);
            int total = (previous == null) ? freqs[i] : previous.intValue() + freqs[i];
            vectorMap.put(terms[i], Integer.valueOf(total));
        }
    }

    /**
     * Returns a copy of the vector containing only the entries whose frequency is greater
     * than 3, whose term is longer than one character and for which
     * {@code StringUtils.isNumeric} is false.
     *
     * @param map term -> frequency
     * @return a new map with the retained entries; the input is not modified
     */
    protected Map<String, Integer> filterVectorMap(Map<String, Integer> map) {
        Map<String, Integer> vectorMap = new HashMap<String, Integer>();
        for (Entry<String, Integer> entry : map.entrySet()) {
            if (entry.getValue() > 3 && !StringUtils.isNumeric(entry.getKey()) && entry.getKey().length() > 1) {
                vectorMap.put(entry.getKey(), entry.getValue());
            }
        }
        return vectorMap;
    }

    /**
     * Computes the angle between an article and a category vector.
     * <p>
     * Every term of the article counts with weight 1 (the article's own frequencies are
     * ignored); the category contributes its frequency for each term it shares with the
     * article. The result is {@code acos(sum(f) / sqrt(sum(f^2) * n))}, where {@code f}
     * ranges over the category frequencies of the shared terms and {@code n} is the number
     * of distinct terms in the article.
     *
     * @param articleVectorMap term -> frequency of the article
     * @param classVectorMap   term -> frequency of the category
     * @return the angle in radians; {@code 20} if either argument is {@code null};
     *         {@code NaN} if the two vectors share no term with a non-zero frequency
     */
    public double caculateVector(Map<String, Integer> articleVectorMap, Map<String, Integer> classVectorMap) {
        if (articleVectorMap == null || classVectorMap == null) {
            if (logger.isDebugEnabled()) {
                logger.debug("itemVectorMap or classVectorMap is null");
            }
            return 20;
        }

        // Accumulate in double: the previous int accumulators were updated through
        // "int += double", a silent narrowing cast that saturates on large frequencies.
        double dotItem = 0;
        double sumOfSquares = 0;
        for (String word : articleVectorMap.keySet()) {
            Integer categoryWordFreq = classVectorMap.get(word);
            if (categoryWordFreq != null) {
                double freq = categoryWordFreq.intValue();
                dotItem += freq;
                sumOfSquares += freq * freq;
            }
        }

        if (sumOfSquares == 0) {
            // No overlap: the ratio would be 0/0. Keep reporting NaN, which callers
            // comparing with "<" treat as "no match".
            return Double.NaN;
        }

        // A single square root of the product is exact whenever the product is a perfect
        // square (e.g. all shared frequencies equal), unlike sqrt(a) * sqrt(b).
        double denominator = Math.sqrt(sumOfSquares * articleVectorMap.size());
        double ratio = dotItem / denominator;
        // Guard against rounding pushing the ratio just outside [-1, 1], where
        // Math.acos would return NaN and a perfect match would be rejected.
        ratio = Math.max(-1.0, Math.min(1.0, ratio));
        return Math.acos(ratio);
    }
}
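// (code-viewer page residue, not part of the program) Keyboard shortcuts
// Copy code
// Ctrl + C
// Search code
// Ctrl + F
// Full-screen mode
// F11
// Switch theme
// Ctrl + Shift + D
// Show shortcuts
// ?
// Increase font size
// Ctrl + =
// Decrease font size
// Ctrl + -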