// jphrase.java
//
// AutoSummary uses Natural Language Processing to generate a contextually-relevant synopsis of
// plain text (the description was truncated in the original listing).
// Language: Java
/*
 * This software is OSI Certified Open Source Software.
 * OSI Certified is a certification mark of the Open Source Initiative.
 *
 * This file is part of the AutoSummary package.
 * AutoSummary is licensed under the terms of the BSD License.
 *
 * Copyright (c) 2005, Charles F. Greenbacker III
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *     * Redistributions of source code must retain the above copyright notice,
 *       this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice,
 *       this list of conditions and the following disclaimer in the documentation
 *       and/or other materials provided with the distribution.
 *     * Neither the name of AutoSummary nor the names of its contributors
 *       may be used to endorse or promote products derived from this software without
 *       specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
 * SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package net.artificialminds.AutoSummary;

import net.artificialminds.AutoSummary.FloatArrayComparator;
import net.artificialminds.JWords.JWord;
import net.artificialminds.JWords.JWords;
import net.artificialminds.JWords.JSense;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;

/**
 * Development platform for word sense disambiguation methods. It creates a JPhrase object
 * containing JWords representing all the words in a given phrase. Statistical analysis can
 * then be performed using the lexical information stored in the JWords, which allows
 * a program to determine the likely usage of each individual word in the phrase.
 * <p>
 * If any word in the phrase has no part-of-speech with a non-zero score, no valid combination
 * exists: {@link #getPOSCombos()} is then empty, {@link #posTagger()} returns the words untagged
 * and {@link #getConfidence()} returns 0.
 * <p>
 * Note: When compiled with a JDK that supports generics (such as JDK 1.5), you will receive
 * notice that JPhrase.java uses unchecked or unsafe operations. This complaint wants
 * the ArrayLists in the code to use generics, but doing so breaks compatibility with
 * older JDKs (J2SE 1.4.2 SDK, etc), so generics are not used.
 *
 * @author                  Charlie Greenbacker
 * @version                 0.1.0a 20050816
 * @since                   AutoSummary 0.1.0a
 */
public class JPhrase
{
    /**
     * Number of part-of-speech categories scored for every word (0=noun, 1=verb, 2=adj, 3=adv).
     */
    private static final int POS_COUNT = 4;

    /**
     * Stores source phrase provided as input.
     */
    private String phrase = null;

    /**
     * Stores JWords representing words in the given phrase. JWords are constructed after breaking down
     * input phrase into individual words.
     */
    private ArrayList wordList = null;

    /**
     * Holds scores for each part-of-speech type for every word in phrase. Scores are obtained from the
     * POS scores from all of the JWords in wordList. Each row has exactly four columns, in the order
     * 0=noun, 1=verb, 2=adj, 3=adv.
     */
    private float[][] posScores = null;

    /**
     * Contains valid parts-of-speech for each word in the phrase. Rows will vary in size depending on number
     * of valid parts-of-speech for the corresponding word. Only parts-of-speech with a non-zero posScore
     * are considered 'valid,' meaning the given word can possibly be used as a particular POS. Entries
     * are integers representing parts-of-speech in the format: 0=noun, 1=verb, 2=adj, 3=adv.
     */
    private int[][] validPOS = null;

    /**
     * Holds every valid combination using all possible POS types for each word. Each combo has an integer in place
     * of each word, corresponding to the part-of-speech used (0=noun, 1=verb, 2=adj, 3=adv). The last column
     * contains the sum of POS scores for the specific parts-of-speech used for each word in that particular combo.
     * Empty when at least one word has no valid part-of-speech.
     */
    private float[][] posCombos = null;

    /**
     * Sole constructor. Creates a JPhrase object containing lexical information about the words in the
     * given phrase upon which semantic analysis will be performed.
     * <p>
     * The phrase is split on single space characters; every resulting token becomes one JWord.
     *
     * @param inputPhrase   the input phrase to be analyzed
     * @throws NullPointerException     if inputPhrase is null
     * @throws IllegalArgumentException if the number of POS combinations exceeds what an array can hold
     * @since               AutoSummary 0.1.0a
     */
    public JPhrase(String inputPhrase)
    {
        if (inputPhrase == null)
            throw new NullPointerException("inputPhrase must not be null");
        phrase = inputPhrase; // capture input phrase

        // chop phrase into individual words, construct JWords out of each and add to wordList
        String[] wordArray = phrase.split(" ");
        wordList = new ArrayList(wordArray.length);
        for (int i=0; i<wordArray.length; i++)
            wordList.add(new JWord(wordArray[i]));
        wordList.trimToSize();

        posScores = collectPOSScores(wordList);
        validPOS = findValidPOS(posScores);
        posCombos = buildPOSCombos(posScores, validPOS, countCombos(validPOS));

        // order the combos by total POS score using the project's FloatArrayComparator
        // (intended to be descending, so that row 0 is the most likely combo)
        Arrays.sort(posCombos, new FloatArrayComparator());
    }

    /**
     * Reads the noun, verb, adjective and adverb scores of every JWord into a words-by-4 table.
     */
    private static float[][] collectPOSScores(ArrayList words)
    {
        float[][] scores = new float[words.size()][];
        for (int i=0; i<scores.length; i++)
        {
            JWord word = (JWord)words.get(i);
            scores[i] = new float[POS_COUNT];
            scores[i][0] = word.getNounScore();
            scores[i][1] = word.getVerbScore();
            scores[i][2] = word.getAdjScore();
            scores[i][3] = word.getAdvScore();
        }
        return scores;
    }

    /**
     * Lists, for every word, the POS indices (0=noun, 1=verb, 2=adj, 3=adv) whose score is non-zero.
     */
    private static int[][] findValidPOS(float[][] scores)
    {
        int[][] valid = new int[scores.length][];
        for (int i=0; i<scores.length; i++)
        {
            // first pass sizes the row, second pass fills it
            int count = 0;
            for (int j=0; j<POS_COUNT; j++)
            {
                if (scores[i][j] != 0)
                    count++;
            }
            valid[i] = new int[count];
            int next = 0;
            for (int j=0; j<POS_COUNT; j++)
            {
                if (scores[i][j] != 0)
                    valid[i][next++] = j;
            }
        }
        return valid;
    }

    /**
     * Multiplies the number of valid POS choices of every word to get the total number of combinations.
     * Returns 0 when any word has no valid POS, and 1 for a phrase with no words.
     */
    private static int countCombos(int[][] valid)
    {
        // a single word without any valid POS means no combination can be formed at all
        for (int i=0; i<valid.length; i++)
        {
            if (valid[i].length == 0)
                return 0;
        }

        // accumulate in a long so that an oversized product is detected instead of silently wrapping
        long total = 1;
        for (int i=0; i<valid.length; i++)
        {
            total *= valid[i].length;
            if (total > Integer.MAX_VALUE)
                throw new IllegalArgumentException(
                    "phrase has too many part-of-speech combinations to enumerate");
        }
        return (int)total;
    }

    /**
     * Enumerates every combination of valid POS choices. Column i of a row holds the POS index chosen for
     * word i; the final column holds the sum of the scores of the chosen POS of every word.
     */
    private static float[][] buildPOSCombos(float[][] scores, int[][] valid, int comboCount)
    {
        int words = scores.length;
        float[][] combos = new float[comboCount][words + 1];

        // stride is how many consecutive combos share the same choice for the current word; it starts at
        // one and is multiplied by the number of choices of each word already placed, so word 0 cycles
        // fastest and the last word slowest
        int stride = 1;
        for (int w=0; w<words; w++)
        {
            int choices = valid[w].length;
            for (int c=0; c<comboCount; c++)
            {
                int pos = valid[w][(c / stride) % choices];
                combos[c][w] = pos;
                combos[c][words] += scores[w][pos];
            }
            stride *= choices;
        }
        return combos;
    }

    /**
     * Retrieves original source phrase.
     *
     * @return          original source phrase
     * @since           AutoSummary 0.1.0a
     */
    public String getPhrase()
    {
        return phrase;
    }

    /**
     * Retrieves list of JWords representing words in the source phrase.
     *
     * @return          the list of JWords representing words in the source phrase
     * @since           AutoSummary 0.1.0a
     */
    public ArrayList getWordList()
    {
        return wordList;
    }

    /**
     * Retrieves scores for each part-of-speech type for every word in phrase.
     *
     * @return          the scores for each part-of-speech type for every word in phrase
     * @since           AutoSummary 0.1.0a
     */
    public float[][] getPOSScores()
    {
        return posScores;
    }

    /**
     * Retrieves valid parts-of-speech for each word in the phrase.
     *
     * @return          valid parts-of-speech for each word in the phrase
     * @since           AutoSummary 0.1.0a
     */
    public int[][] getValidPOS()
    {
        return validPOS;
    }

    /**
     * Retrieves every valid combination using all possible POS types for each word.
     *
     * @return          every valid combination using all possible POS types for each word; empty if
     *                  any word has no valid part-of-speech
     * @since           AutoSummary 0.1.0a
     */
    public float[][] getPOSCombos()
    {
        return posCombos;
    }

    /**
     * Retrieves most likely combo based on overall POS scores. Output resembles original source
     * phrase, except with the suggested part-of-speech in parentheses after each word. When no
     * valid combo exists, the words are returned without tags, each followed by a space.
     *
     * @return          most likely combo; each word tagged with suggested part-of-speech
     * @since           AutoSummary 0.1.0a
     */
    public String posTagger()
    {
        // StringBuffer rather than repeated String concatenation (and rather than StringBuilder,
        // which would break the pre-1.5 compatibility this file aims for)
        StringBuffer taggedPhrase = new StringBuffer();
        boolean haveCombo = posCombos.length > 0;

        // loop through all words in highest scoring combo, and append POS identified as 0, 1, 2 or 3
        for (int i=0; i<wordList.size(); i++)
        {
            taggedPhrase.append(((JWord)wordList.get(i)).getWord());
            if (!haveCombo)
            {
                taggedPhrase.append(' ');
                continue;
            }
            switch ((int)posCombos[0][i])
            {
                case 0:
                    taggedPhrase.append("(n) ");
                    break;
                case 1:
                    taggedPhrase.append("(v) ");
                    break;
                case 2:
                    taggedPhrase.append("(adj) ");
                    break;
                case 3:
                    taggedPhrase.append("(adv) ");
                    break;
            }
        }
        return taggedPhrase.toString();
    }

    /**
     * Calculates and returns level of confidence of most likely combo (from posTagger()), based on
     * average relative POS score of each word in combo. Output is a float value (out of 100)
     * representing average statistical likeliness of each word appearing as the POS determined by
     * posTagger(). Returns 0 when the phrase has no words or no valid combo exists.
     *
     * @return          level of confidence, as a percentage
     * @since           AutoSummary 0.1.0a
     */
    public float getConfidence()
    {
        // nothing to average: avoid indexing an empty combo table and dividing by zero words
        if (posScores.length == 0 || posCombos.length == 0)
            return 0;

        float confidence = 0; // stores output
        for (int i=0; i<posScores.length; i++)
        {
            float[] scores = posScores[i];
            float total = scores[0] + scores[1] + scores[2] + scores[3];
            // a zero total would yield NaN and poison the average, so such a word contributes nothing
            if (total != 0)
                confidence += ((scores[(int)posCombos[0][i]] / total) * 100);
        }
        return confidence / posScores.length;
    }
}
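// File size: 49 K
// Uploaded by: liuhai
// Category: multilingual processing
// Related tags:
//
// #contextually-relevant #AutoSummary #Processing #Language
// Keyboard shortcuts:
//
// Copy code: Ctrl + C
// Search code: Ctrl + F
// Full-screen mode: F11
// Toggle theme: Ctrl + Shift + D
// Show shortcuts: ?
// Increase font size: Ctrl + =
// Decrease font size: Ctrl + -
// (unrelated advertising text from the hosting page omitted)
//
// jphrase.java
//
// Keyboard shortcuts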