?? birchcluster.java
字號:
/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* * BIRCHCluster.java * Copyright (C) 2001 University of Waikato, Hamilton, New Zealand * */package weka.datagenerators.clusterers;import weka.core.Attribute;import weka.core.FastVector;import weka.core.Instance;import weka.core.Instances;import weka.core.Option;import weka.core.SelectedTag;import weka.core.Tag;import weka.core.TechnicalInformation;import weka.core.TechnicalInformationHandler;import weka.core.Utils;import weka.core.TechnicalInformation.Field;import weka.core.TechnicalInformation.Type;import weka.datagenerators.ClusterGenerator;import java.io.Serializable;import java.util.Enumeration;import java.util.Random;import java.util.Vector;/** <!-- globalinfo-start --> * Cluster data generator designed for the BIRCH System<br/> * <br/> * Dataset is generated with instances in K clusters.<br/> * Instances are 2-d data points.<br/> * Each cluster is characterized by the number of data points in itits radius and its center. The location of the cluster centers isdetermined by the pattern parameter. Three patterns are currentlysupported grid, sine and random.<br/> * <br/> * For more information refer to:<br/> * <br/> * Tian Zhang, Raghu Ramakrishnan, Miron Livny: BIRCH: An Efficient Data Clustering Method for Very Large Databases. In: ACM SIGMOD International Conference on Management of Data, 103-114, 1996. * <p/> <!-- globalinfo-end --> * <!-- technical-bibtex-start --> * BibTeX: * <pre> * @inproceedings{Zhang1996, * author = {Tian Zhang and Raghu Ramakrishnan and Miron Livny}, * booktitle = {ACM SIGMOD International Conference on Management of Data}, * pages = {103-114}, * publisher = {ACM Press}, * title = {BIRCH: An Efficient Data Clustering Method for Very Large Databases}, * year = {1996} * } * </pre> * <p/> <!-- technical-bibtex-end --> * <!-- options-start --> * Valid options are: <p/> * * <pre> -h * Prints this help.</pre> * * <pre> -o <file> * The name of the output file, otherwise the generated data is * printed to stdout.</pre> * * <pre> -r <name> * The name of the relation.</pre> * * <pre> -d * Whether to print debug informations.</pre> * * <pre> -S * The seed for random function (default 1)</pre> * * <pre> -a <num> * The number of attributes (default 10).</pre> * * <pre> -c * Class Flag, if set, the cluster is listed in extra attribute.</pre> * * <pre> -b <range> * The indices for boolean attributes.</pre> * * <pre> -m <range> * The indices for nominal attributes.</pre> * * <pre> -k <num> * The number of clusters (default 4)</pre> * * <pre> -G * Set pattern to grid (default is random). * This flag cannot be used at the same time as flag I. * The pattern is random, if neither flag G nor flag I is set.</pre> * * <pre> -I * Set pattern to sine (default is random). * This flag cannot be used at the same time as flag I. * The pattern is random, if neither flag G nor flag I is set.</pre> * * <pre> -N <num>..<num> * The range of number of instances per cluster (default 1..50). * Lower number must be between 0 and 2500, * upper number must be between 50 and 2500.</pre> * * <pre> -R <num>..<num> * The range of radius per cluster (default 0.1..1.4142135623730951). * Lower number must be between 0 and SQRT(2), * upper number must be between SQRT(2) and SQRT(32).</pre> * * <pre> -M <num> * The distance multiplier (default 4.0).</pre> * * <pre> -C <num> * The number of cycles (default 4).</pre> * * <pre> -O * Flag for input order is ORDERED. If flag is not set then * input order is RANDOMIZED. RANDOMIZED is currently not * implemented, therefore is the input order always ORDERED.</pre> * * <pre> -P <num> * The noise rate in percent (default 0.0). * Can be between 0% and 30%. (Remark: The original * algorithm only allows noise up to 10%.)</pre> * <!-- options-end --> * * @author Gabi Schmidberger (gabi@cs.waikato.ac.nz) * @author FracPete (fracpete at waikato dot ac dot nz) * @version $Revision: 1.7 $ */public class BIRCHCluster extends ClusterGenerator implements TechnicalInformationHandler { /** for serialization */ static final long serialVersionUID = -334820527230755027L; /** Number of Clusters the dataset should have */ protected int m_NumClusters; /** minimal number of instances per cluster (option N)*/ private int m_MinInstNum; /** maximal number of instances per cluster (option N)*/ private int m_MaxInstNum; /** minimum radius (option R)*/ private double m_MinRadius; /** maximum radius (option R)*/ private double m_MaxRadius; /** Constant set for choice of pattern. (option G)*/ public static final int GRID = 0; /** Constant set for choice of pattern. (option I)*/ public static final int SINE = 1; /** Constant set for choice of pattern. (default)*/ public static final int RANDOM = 2; /** the pattern tags */ public static final Tag[] TAGS_PATTERN = { new Tag(GRID, "Grid"), new Tag(SINE, "Sine"), new Tag(RANDOM, "Random") }; /** pattern (changed with options G or S)*/ private int m_Pattern; /** distance multiplier (option M)*/ private double m_DistMult; /** number of cycles (option C)*/ private int m_NumCycles; /** Constant set for input order (option O)*/ public static final int ORDERED = 0; /** Constant set for input order (default)*/ public static final int RANDOMIZED = 1; /** the input order tags */ public static final Tag[] TAGS_INPUTORDER = { new Tag(ORDERED, "ordered"), new Tag(RANDOMIZED, "randomized") }; /** input order (changed with option O)*/ private int m_InputOrder; /** noise rate in percent (option P, between 0 and 30)*/ private double m_NoiseRate; /** cluster list */ private FastVector m_ClusterList; // following are used for pattern is GRID /** grid size*/ private int m_GridSize; /** grid width*/ private double m_GridWidth; /** * class to represent cluster */ private class Cluster implements Serializable { /** for serialization */ static final long serialVersionUID = -8336901069823498140L; /** number of instances for this cluster */ private int m_InstNum; /** radius of cluster * variance is radius ** 2 / 2 */ private double m_Radius; /** center of cluster = array of Double values */ private double[] m_Center; /** * Constructor, used for pattern = RANDOM * * @param instNum the number of instances * @param radius radius of the cluster * @param random the random number generator to use */ private Cluster(int instNum, double radius, Random random) { m_InstNum = instNum; m_Radius = radius; m_Center = new double[getNumAttributes()]; for (int i = 0; i < getNumAttributes(); i++) { m_Center[i] = random.nextDouble() * (double) m_NumClusters; } } /** * Constructor, used for pattern = GRID * * @param instNum the number of instances * @param radius radius of the cluster * @param gridVector vector for grid positions * @param gridWidth factor for grid position */ // center is defined in the constructor of cluster private Cluster(int instNum, double radius, int[] gridVector, double gridWidth) { m_InstNum = instNum; m_Radius = radius; m_Center = new double[getNumAttributes()]; for (int i = 0; i < getNumAttributes(); i++) { m_Center[i] = ((double) gridVector[i] + 1.0) * gridWidth; } } /** * returns the number of instances * * @return the number of instances */ private int getInstNum() { return m_InstNum; } /** * returns the radius * * @return the radius */ private double getRadius() { return m_Radius; } /** * returns the variance * * @return the variance */ private double getVariance() { return Math.pow(m_Radius, 2.0) / 2.0; } /** * returns the standard deviation * * @return the standard deviation */ private double getStdDev() { return (m_Radius / Math.pow(2.0, 0.5)); } /** * returns the centers * * @return the centers */ private double[] getCenter() { return m_Center; } /** * returns the center value for a given dimension * * @param dimension the dimension to return the center for * @return the center value for the given dimension * @throws Exception if dimension invalid */ private double getCenterValue(int dimension) throws Exception { if (dimension >= m_Center.length) throw new Exception("Current system has only " + m_Center.length + " dimensions."); return m_Center[dimension]; } } // end class Cluster /** * class to represent Vector for placement of the center in space */ private class GridVector implements Serializable { /** for serialization */ static final long serialVersionUID = -1900309948991039522L; /** array of integer */ private int[] m_GridVector; /** one higher then the highest possible integer value * in any of the integers in the gridvector */ private int m_Base; /** size of vector */ private int m_Size; /** * Constructor * * @param numDim number of dimensions = number of attributes * @param base is one higher then the highest possible integer value * in any of the integers in the gridvector */ private GridVector(int numDim, int base) { m_Size = numDim; m_Base = base; m_GridVector = new int [numDim]; for (int i = 0; i < numDim; i++) m_GridVector[i] = 0; } /** * returns the integer array * * @return the integer array */ private int[] getGridVector() { return m_GridVector; } /** * Overflow has occurred when integer is zero. * *@param digit the input integer *@return true if digit is 0 */ private boolean overflow(int digit) { return (digit == 0); } /** * Adds one to integer and sets to zero, if new value was * equal m_Base. * *@param digit the input integer *@return new integer object */ private int addOne(int digit) { int value = digit + 1; if (value >= m_Base) value = 0; return value; } /** * add 1 to vector */ private void addOne() { m_GridVector[0] = addOne(m_GridVector[0]); int i = 1; while (overflow(m_GridVector[i - 1]) && i < m_Size) { m_GridVector[i] = addOne(m_GridVector[i]); i++; } } } // end class GridVector /** * initializes the generator with default values */ public BIRCHCluster() { super(); setNumClusters(defaultNumClusters()); setMinInstNum(defaultMinInstNum()); setMaxInstNum(defaultMaxInstNum()); setMinRadius(defaultMinRadius()); setMaxRadius(defaultMaxRadius()); setPattern(defaultPattern()); setDistMult(defaultDistMult()); setNumCycles(defaultNumCycles()); setInputOrder(defaultInputOrder()); setNoiseRate(defaultNoiseRate()); } /** * Returns a string describing this data generator. * * @return a description of the data generator suitable for * displaying in the explorer/experimenter gui */ public String globalInfo() { return "Cluster data generator designed for the BIRCH System\n\n" + "Dataset is generated with instances in K clusters.\n" + "Instances are 2-d data points.\n" + "Each cluster is characterized by the number of data points in it" + "its radius and its center. The location of the cluster centers is" + "determined by the pattern parameter. Three patterns are currently" + "supported grid, sine and random.\n\n" + "For more information refer to:\n\n" + getTechnicalInformation().toString(); } /** * Returns an instance of a TechnicalInformation object, containing * detailed information about the technical background of this class, * e.g., paper reference or book this class is based on. * * @return the technical information about this class */ public TechnicalInformation getTechnicalInformation() { TechnicalInformation result; result = new TechnicalInformation(Type.INPROCEEDINGS); result.setValue(Field.AUTHOR, "Tian Zhang and Raghu Ramakrishnan and Miron Livny"); result.setValue(Field.TITLE, "BIRCH: An Efficient Data Clustering Method for Very Large Databases"); result.setValue(Field.BOOKTITLE, "ACM SIGMOD International Conference on Management of Data"); result.setValue(Field.YEAR, "1996"); result.setValue(Field.PAGES, "103-114"); result.setValue(Field.PUBLISHER, "ACM Press"); return result; } /** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options */ public Enumeration listOptions() { Vector result = enumToVector(super.listOptions()); result.addElement(new Option( "\tThe number of clusters (default " + defaultNumClusters() + ")", "k", 1, "-k <num>")); result.addElement(new Option( "\tSet pattern to grid (default is random).\n" + "\tThis flag cannot be used at the same time as flag I.\n" + "\tThe pattern is random, if neither flag G nor flag I is set.", "G", 0, "-G")); result.addElement(new Option( "\tSet pattern to sine (default is random).\n" + "\tThis flag cannot be used at the same time as flag I.\n" + "\tThe pattern is random, if neither flag G nor flag I is set.", "I", 0, "-I")); result.addElement(new Option(
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -