?? apriori.java
字號:
/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* * Apriori.java * Copyright (C) 1999 Eibe Frank,Mark Hall * */package weka.associations;import java.io.*;import java.util.*;import weka.core.*;import weka.filters.Filter;import weka.filters.AttributeFilter;/** * Class implementing an Apriori-type algorithm. Iteratively reduces the minimum * support until it finds the required number of rules with the given minimum * confidence. <p> * * Reference: R. Agrawal, R. Srikant (1994). <i>Fast algorithms for * mining association rules in large databases </i>. Proc * International Conference on Very Large Databases, * pp. 478-499. Santiage, Chile: Morgan Kaufmann, Los Altos, CA. <p> * * Valid options are:<p> * * -N required number of rules <br> * The required number of rules (default: 10). <p> * * -T type of metric by which to sort rules <br> * 0 = confidence | 1 = lift | 2 = leverage | 3 = Conviction. <p> * * -C minimum confidence of a rule <br> * The minimum confidence of a rule (default: 0.9). <p> * * -D delta for minimum support <br> * The delta by which the minimum support is decreased in * each iteration (default: 0.05). <p> * * -U upper bound for minimum support <br> * The upper bound for minimum support. Don't explicitly look for * rules with more than this level of support. <p> * * -M lower bound for minimum support <br> * The lower bound for the minimum support (default = 0.1). <p> * * -S significance level <br> * If used, rules are tested for significance at * the given level. Slower (default = no significance testing). <p> * * -R <br> * If set then columns that contain all missing values are removed from * the data. * * -I <br> * If set the itemsets found are also output (default = no). <p> * * @author Eibe Frank (eibe@cs.waikato.ac.nz) * @author Mark Hall (mhall@cs.waikato.ac.nz) * @version $Revision: 1.11 $ */public class Apriori extends Associator implements OptionHandler { /** The minimum support. */ protected double m_minSupport; /** The upper bound on the support */ protected double m_upperBoundMinSupport; /** The lower bound for the minimum support. */ protected double m_lowerBoundMinSupport; /** Metric types. */ protected static final int CONFIDENCE = 0; protected static final int LIFT = 1; protected static final int LEVERAGE = 2; protected static final int CONVICTION = 3; public static final Tag [] TAGS_SELECTION = { new Tag(CONFIDENCE, "Confidence"), new Tag(LIFT, "Lift"), new Tag(LEVERAGE, "Leverage"), new Tag(CONVICTION, "Conviction") }; /** The selected metric type. */ protected int m_metricType = CONFIDENCE; /** The minimum metric score. */ protected double m_minMetric; /** The maximum number of rules that are output. */ protected int m_numRules; /** Delta by which m_minSupport is decreased in each iteration. */ protected double m_delta; /** Significance level for optional significance test. */ protected double m_significanceLevel; /** Number of cycles used before required number of rules was one. */ protected int m_cycles; /** The set of all sets of itemsets L. */ protected FastVector m_Ls; /** The same information stored in hash tables. */ protected FastVector m_hashtables; /** The list of all generated rules. */ protected FastVector[] m_allTheRules; /** The instances (transactions) to be used for generating the association rules. */ protected Instances m_instances; /** Output itemsets found? */ protected boolean m_outputItemSets; protected boolean m_removeMissingCols; /** Report progress iteratively */ protected boolean m_verbose; /** * Returns a string describing this associator * @return a description of the evaluator suitable for * displaying in the explorer/experimenter gui */ public String globalInfo() { return "Finds association rules."; } /** * Constructor that allows to sets default values for the * minimum confidence and the maximum number of rules * the minimum confidence. */ public Apriori() { resetOptions(); } /** * Resets the options to the default values. */ public void resetOptions() { m_removeMissingCols = false; m_verbose = false; m_delta = 0.05; m_minMetric = 0.90; m_numRules = 10; m_lowerBoundMinSupport = 0.1; m_upperBoundMinSupport = 1.0; m_significanceLevel = -1; m_outputItemSets = false; } /** * Removes columns that are all missing from the data * @param instances the instances * @return a new set of instances with all missing columns removed */ private Instances removeMissingColumns(Instances instances) throws Exception { int numInstances = instances.numInstances(); StringBuffer deleteString = new StringBuffer(); int removeCount = 0; boolean first = true; int maxCount = 0; for (int i=0;i<instances.numAttributes();i++) { AttributeStats as = instances.attributeStats(i); if (m_upperBoundMinSupport == 1.0 && maxCount != numInstances) { // see if we can decrease this by looking for the most frequent value int [] counts = as.nominalCounts; if (counts[Utils.maxIndex(counts)] > maxCount) { maxCount = counts[Utils.maxIndex(counts)]; } } if (as.missingCount == numInstances) { if (first) { deleteString.append((i+1)); first = false; } else { deleteString.append(","+(i+1)); } removeCount++; } } if (m_verbose) { System.err.println("Removed : "+removeCount+" columns with all missing " +"values."); } if (m_upperBoundMinSupport == 1.0 && maxCount != numInstances) { m_upperBoundMinSupport = (double)maxCount / (double)numInstances; if (m_verbose) { System.err.println("Setting upper bound min support to : " +m_upperBoundMinSupport); } } if (deleteString.toString().length() > 0) { AttributeFilter af = new AttributeFilter(); af.setAttributeIndices(deleteString.toString()); af.setInvertSelection(false); af.setInputFormat(instances); Instances newInst = Filter.useFilter(instances, af); return newInst; } return instances; } /** * Method that generates all large itemsets with a minimum support, and from * these all association rules with a minimum confidence. * * @param instances the instances to be used for generating the associations * @exception Exception if rules can't be built successfully */ public void buildAssociations(Instances instances) throws Exception { double[] confidences, supports; int[] indices; FastVector[] sortedRuleSet; int necSupport=0; if (instances.checkForStringAttributes()) { throw new Exception("Can't handle string attributes!"); } if (m_removeMissingCols) { instances = removeMissingColumns(instances); } // Decrease minimum support until desired number of rules found. m_cycles = 0; m_minSupport = m_upperBoundMinSupport - m_delta; m_minSupport = (m_minSupport < m_lowerBoundMinSupport) ? m_lowerBoundMinSupport : m_minSupport; do { // Reserve space for variables m_Ls = new FastVector(); m_hashtables = new FastVector(); m_allTheRules = new FastVector[6]; m_allTheRules[0] = new FastVector(); m_allTheRules[1] = new FastVector(); m_allTheRules[2] = new FastVector(); if (m_metricType != CONFIDENCE || m_significanceLevel != -1) { m_allTheRules[3] = new FastVector(); m_allTheRules[4] = new FastVector(); m_allTheRules[5] = new FastVector(); } sortedRuleSet = new FastVector[6]; sortedRuleSet[0] = new FastVector(); sortedRuleSet[1] = new FastVector(); sortedRuleSet[2] = new FastVector(); if (m_metricType != CONFIDENCE || m_significanceLevel != -1) { sortedRuleSet[3] = new FastVector(); sortedRuleSet[4] = new FastVector(); sortedRuleSet[5] = new FastVector(); } // Find large itemsets and rules findLargeItemSets(instances); if (m_significanceLevel != -1 || m_metricType != CONFIDENCE) findRulesBruteForce(); else findRulesQuickly(); // Sort rules according to their support supports = new double[m_allTheRules[2].size()]; for (int i = 0; i < m_allTheRules[2].size(); i++) supports[i] = (double)((ItemSet)m_allTheRules[1].elementAt(i)).support(); indices = Utils.stableSort(supports); for (int i = 0; i < m_allTheRules[2].size(); i++) { sortedRuleSet[0].addElement(m_allTheRules[0].elementAt(indices[i])); sortedRuleSet[1].addElement(m_allTheRules[1].elementAt(indices[i])); sortedRuleSet[2].addElement(m_allTheRules[2].elementAt(indices[i])); if (m_metricType != CONFIDENCE || m_significanceLevel != -1) { sortedRuleSet[3].addElement(m_allTheRules[3].elementAt(indices[i])); sortedRuleSet[4].addElement(m_allTheRules[4].elementAt(indices[i])); sortedRuleSet[5].addElement(m_allTheRules[5].elementAt(indices[i])); } } // Sort rules according to their confidence m_allTheRules[0].removeAllElements(); m_allTheRules[1].removeAllElements(); m_allTheRules[2].removeAllElements(); if (m_metricType != CONFIDENCE || m_significanceLevel != -1) { m_allTheRules[3].removeAllElements(); m_allTheRules[4].removeAllElements(); m_allTheRules[5].removeAllElements(); } confidences = new double[sortedRuleSet[2].size()]; int sortType = 2 + m_metricType; for (int i = 0; i < sortedRuleSet[2].size(); i++) confidences[i] = ((Double)sortedRuleSet[sortType].elementAt(i)).doubleValue(); indices = Utils.stableSort(confidences); for (int i = sortedRuleSet[0].size() - 1; (i >= (sortedRuleSet[0].size() - m_numRules)) && (i >= 0); i--) { m_allTheRules[0].addElement(sortedRuleSet[0].elementAt(indices[i])); m_allTheRules[1].addElement(sortedRuleSet[1].elementAt(indices[i])); m_allTheRules[2].addElement(sortedRuleSet[2].elementAt(indices[i])); if (m_metricType != CONFIDENCE || m_significanceLevel != -1) { m_allTheRules[3].addElement(sortedRuleSet[3].elementAt(indices[i])); m_allTheRules[4].addElement(sortedRuleSet[4].elementAt(indices[i])); m_allTheRules[5].addElement(sortedRuleSet[5].elementAt(indices[i]));
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -