// Source listing: jphrase.java
// Font size:
/*
* This software is OSI Certified Open Source Software.
* OSI Certified is a certification mark of the Open Source Initiative.
*
* This file is part of the AutoSummary package.
* AutoSummary is licensed under the terms of the BSD License.
*
* Copyright (c) 2005, Charles F. Greenbacker III
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* * Neither the name of AutoSummary nor the names of its contributors
* may be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
* SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
* OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package net.artificialminds.AutoSummary;
import net.artificialminds.AutoSummary.FloatArrayComparator;
import net.artificialminds.JWords.JWord;
import net.artificialminds.JWords.JWords;
import net.artificialminds.JWords.JSense;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
/**
 * Development platform for word sense disambiguation methods. It creates a JPhrase object
 * containing JWords representing all the words in a given phrase. Statistical analysis can
 * then be performed using the lexical information stored in the JWords, which allows
 * a program to determine the likely usage of each individual word in the phrase.
 * <p>
 * Note: When compiled with a JDK that supports generics (such as JDK 1.5), you will receive
 * notice that JPhrase.java uses unchecked or unsafe operations. This complaint wants
 * the ArrayLists in the code to use generics, but doing so breaks compatibility with
 * older JDKs (J2SE 1.4.2 SDK, etc), so generics are not used.
 *
 * @author Charlie Greenbacker
 * @version 0.1.0a 20050816
 * @since AutoSummary 0.1.0a
 */
public class JPhrase
{
    /**
     * Number of part-of-speech categories scored for every word
     * (0=noun, 1=verb, 2=adj, 3=adv).
     */
    private static final int POS_COUNT = 4;

    /**
     * Stores source phrase provided as input.
     */
    private String phrase = null;

    /**
     * Stores JWords representing words in the given phrase. JWords are constructed after breaking down
     * input phrase into individual words.
     */
    private ArrayList wordList = null;

    /**
     * Holds scores for each part-of-speech type for every word in phrase. One row per word; each row has
     * exactly four columns holding the noun, verb, adjective and adverb scores (in that order) reported
     * by the corresponding JWord in wordList.
     */
    private float[][] posScores = null;

    /**
     * Contains valid parts-of-speech for each word in the phrase. Rows will vary in size depending on number
     * of valid parts-of-speech for the corresponding word. Only parts-of-speech with a non-zero posScore
     * are considered 'valid,' meaning the given word can possibly be used as a particular POS. Entries
     * are integers representing parts-of-speech in the format: 0=noun, 1=verb, 2=adj, 3=adv. A word with
     * no non-zero score has an empty row.
     */
    private int[][] validPOS = null;

    /**
     * Holds every valid combination using all possible POS types for each word. Each combo has an integer in
     * place of each word, corresponding to the part-of-speech used (0=noun, 1=verb, 2=adj, 3=adv). The last
     * column contains the sum of POS scores for the specific parts-of-speech used for each word in that
     * particular combo. Has zero rows when at least one word has no valid part-of-speech.
     */
    private float[][] posCombos = null;

    /**
     * Sole constructor. Creates a JPhrase object containing lexical information about the words in the
     * given phrase upon which semantic analysis will be performed.
     * <p>
     * The phrase is split on runs of whitespace; leading and trailing whitespace is ignored, so no
     * empty-string words are created.
     *
     * @param inputPhrase the input phrase to be analyzed
     * @throws NullPointerException if inputPhrase is null
     * @since AutoSummary 0.1.0a
     */
    public JPhrase(String inputPhrase)
    {
        phrase = inputPhrase; // capture input phrase

        // chop phrase into individual words, construct JWords out of each and add to wordList
        String[] wordArray = tokenize(phrase);
        wordList = new ArrayList(wordArray.length);
        for (int i = 0; i < wordArray.length; i++)
        {
            wordList.add(new JWord(wordArray[i]));
        }
        wordList.trimToSize();

        posScores = collectPOSScores(wordList);
        validPOS = findValidPOS(posScores);
        posCombos = buildPOSCombos(validPOS, posScores);

        // order the combos with the project's comparator; posTagger() and getConfidence() treat
        // row 0 as the best combo, so the comparator is expected to sort by descending total score
        Comparator comboComparator = new FloatArrayComparator();
        Arrays.sort(posCombos, comboComparator);
    }

    /**
     * Splits text into words on runs of whitespace, ignoring leading and trailing whitespace.
     *
     * @param text the text to split
     * @return the words in order; an empty array if text contains only whitespace
     */
    private static String[] tokenize(String text)
    {
        String trimmed = text.trim();
        if (trimmed.length() == 0)
        {
            // "".split(...) would yield one empty-string word, so handle this case explicitly
            return new String[0];
        }
        return trimmed.split("\\s+");
    }

    /**
     * Reads the noun, verb, adjective and adverb scores of every JWord in the list.
     *
     * @param words list of JWord objects
     * @return one row per word, each with POS_COUNT scores in the order noun, verb, adj, adv
     */
    private static float[][] collectPOSScores(ArrayList words)
    {
        float[][] scores = new float[words.size()][];
        for (int i = 0; i < scores.length; i++)
        {
            JWord word = (JWord)words.get(i);
            scores[i] = new float[POS_COUNT];
            scores[i][0] = word.getNounScore();
            scores[i][1] = word.getVerbScore();
            scores[i][2] = word.getAdjScore();
            scores[i][3] = word.getAdvScore();
        }
        return scores;
    }

    /**
     * Lists, for every word, the POS indices whose score is non-zero.
     *
     * @param scores per-word POS scores, POS_COUNT columns per row
     * @return one row per word containing the valid POS indices in ascending order
     */
    private static int[][] findValidPOS(float[][] scores)
    {
        int[][] valid = new int[scores.length][];
        for (int i = 0; i < scores.length; i++)
        {
            // first pass: count the non-zero scores so the row can be sized exactly
            int count = 0;
            for (int pos = 0; pos < POS_COUNT; pos++)
            {
                if (scores[i][pos] != 0)
                {
                    count++;
                }
            }
            // second pass: record the indices of the non-zero scores
            valid[i] = new int[count];
            int next = 0;
            for (int pos = 0; pos < POS_COUNT; pos++)
            {
                if (scores[i][pos] != 0)
                {
                    valid[i][next] = pos;
                    next++;
                }
            }
        }
        return valid;
    }

    /**
     * Builds every combination that picks one valid POS for each word. Each row stores the chosen POS
     * index per word (0 = n, 1 = v, 2 = adj, 3 = adv) followed by the cumulative score of those choices
     * in the last column.
     *
     * @param valid valid POS indices per word
     * @param scores per-word POS scores
     * @return the unsorted combos; zero rows if any word has no valid POS
     */
    private static float[][] buildPOSCombos(int[][] valid, float[][] scores)
    {
        // total number of combos is the product of the per-word choice counts (starts at 1 because
        // it is built by multiplication); the int product is not checked for overflow
        int comboCount = 1;
        for (int i = 0; i < valid.length; i++)
        {
            comboCount *= valid[i].length;
        }

        int scoreColumn = scores.length; // index of the trailing cumulative-score column
        float[][] combos = new float[comboCount][scores.length + 1];

        // run length of each choice within the repeating "pattern" for the current word;
        // starts at 1 and is multiplied by the number of choices after each word
        int runLength = 1;
        for (int i = 0; i < valid.length; i++)
        {
            int combo = 0;
            while (combo < comboCount) // repeat the pattern until every row is filled
            {
                for (int j = 0; j < valid[i].length; j++)
                {
                    for (int k = 0; k < runLength; k++)
                    {
                        if (combo < comboCount)
                        {
                            combos[combo][i] = valid[i][j];
                            combos[combo][scoreColumn] += scores[i][valid[i][j]];
                            combo++;
                        }
                    }
                }
            }
            runLength *= valid[i].length; // changes "pattern" for the next word
        }
        return combos;
    }

    /**
     * Retrieves original source phrase.
     *
     * @return original source phrase
     * @since AutoSummary 0.1.0a
     */
    public String getPhrase()
    {
        return phrase;
    }

    /**
     * Retrieves list of JWords representing words in the source phrase.
     *
     * @return the list of JWords representing words in the source phrase
     * @since AutoSummary 0.1.0a
     */
    public ArrayList getWordList()
    {
        return wordList;
    }

    /**
     * Retrieves scores for each part-of-speech type for every word in phrase.
     *
     * @return the scores for each part-of-speech type for every word in phrase
     * @since AutoSummary 0.1.0a
     */
    public float[][] getPOSScores()
    {
        return posScores;
    }

    /**
     * Retrieves valid parts-of-speech for each word in the phrase.
     *
     * @return valid parts-of-speech for each word in the phrase
     * @since AutoSummary 0.1.0a
     */
    public int[][] getValidPOS()
    {
        return validPOS;
    }

    /**
     * Retrieves every valid combination using all possible POS types for each word.
     *
     * @return every valid combination using all possible POS types for each word
     * @since AutoSummary 0.1.0a
     */
    public float[][] getPOSCombos()
    {
        return posCombos;
    }

    /**
     * Retrieves most likely combo based on overall POS scores. Output resembles original source
     * phrase, except with the suggested part-of-speech in parentheses after each word.
     * <p>
     * If no combo exists (at least one word has no valid part-of-speech), the original source
     * phrase is returned untagged.
     *
     * @return most likely combo; each word tagged with suggested part-of-speech
     * @since AutoSummary 0.1.0a
     */
    public String posTagger()
    {
        if (posCombos.length == 0)
        {
            // nothing to tag with; previously this case threw ArrayIndexOutOfBoundsException
            return phrase;
        }

        float[] best = posCombos[0]; // first combo after sorting
        StringBuffer taggedPhrase = new StringBuffer(); // StringBuffer keeps JDK 1.4 compatibility

        // loop through all words in the first combo (last column is the score, not a word)
        // and append POS identified as 0, 1, 2 or 3
        for (int i = 0; i < (best.length - 1); i++)
        {
            taggedPhrase.append(((JWord)wordList.get(i)).getWord());
            switch ((int)best[i])
            {
                case 0:
                    taggedPhrase.append("(n) ");
                    break;
                case 1:
                    taggedPhrase.append("(v) ");
                    break;
                case 2:
                    taggedPhrase.append("(adj) ");
                    break;
                case 3:
                    taggedPhrase.append("(adv) ");
                    break;
            }
        }
        return taggedPhrase.toString();
    }

    /**
     * Calculates and returns level of confidence of most likely combo (from posTagger()), based on
     * average relative POS score of each word in combo. Output is a float value (out of 100)
     * representing average statistical likeliness of each word appearing as the POS determined by
     * posTagger().
     * <p>
     * Returns 0 when the phrase has no words or no combo exists. A word whose four scores sum to
     * zero contributes 0 to the average rather than producing NaN.
     *
     * @return level of confidence, as a percentage
     * @since AutoSummary 0.1.0a
     */
    public float getConfidence()
    {
        if (posCombos.length == 0 || posScores.length == 0)
        {
            return 0;
        }

        float[] best = posCombos[0]; // first combo after sorting
        float confidence = 0; // running sum of per-word percentages
        for (int i = 0; i < posScores.length; i++)
        {
            float total = posScores[i][0] + posScores[i][1] + posScores[i][2] + posScores[i][3];
            if (total != 0) // avoid dividing by zero
            {
                confidence += ((posScores[i][(int)best[i]] / total) * 100);
            }
        }
        return confidence / posScores.length;
    }
}
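/*
 * Keyboard shortcuts (help text from the code viewer; not part of the program)
 * Copy code: Ctrl + C
 * Search code: Ctrl + F
 * Full-screen mode: F11
 * Switch theme: Ctrl + Shift + D
 * Show shortcuts: ?
 * Increase font size: Ctrl + =
 * Decrease font size: Ctrl + -
 */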