?? kmeans.java
字號:
/**
*
* AgentAcademy - an open source Data Mining framework for
* training intelligent agents
*
* Copyright (C) 2001-2003 AA Consortium.
*
* This library is open source software; you can redistribute it
* and/or modify it under the terms of the GNU Lesser General
* Public License as published by the Free Software Foundation;
* either version 2.0 of the License, or (at your option) any later
* version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*
*/
package org.agentacademy.modules.dataminer.clusterers;
/**
* <p>Title: The Data Miner prototype</p>
* <p>Description: A prototype for the DataMiner (DM), the Agent Academy (AA) module responsible for performing data mining on the contents of the Agent Use Repository (AUR). The extracted knowledge is to be sent back to the AUR in the form of a PMML document.</p>
* <p>Copyright: Copyright (c) 2002</p>
* <p>Company: CERTH</p>
* @author asymeon
* @version 0.3
*/
import java.io.*;
import java.util.*;
import org.agentacademy.modules.dataminer.core.*;
import org.agentacademy.modules.dataminer.filters.Filter;
import org.agentacademy.modules.dataminer.filters.ReplaceMissingValuesFilter;
import org.jdom.*;
import org.jdom.output.*;
import org.apache.log4j.Logger;
/**
* Simple k means clustering class.
*
* Valid options are:<p>
*
* -N <number of clusters> <br>
* Specify the number of clusters to generate. <p>
*
* -S <seed> <br>
* Specify random number seed. <p>
*
*/
public class KMeans extends Clusterer implements OptionHandler {
public static Logger log = Logger.getLogger(KMeans.class);
/** The pmmlDocument Document */
public static Document pmmlDocument = null;
/*
* training instances
*/
private Instances m_instances;
/**
* replace missing values in training instances
*/
private ReplaceMissingValuesFilter m_ReplaceMissingFilter;
/**
* number of clusters to generate
*/
private int m_NumClusters = 2;
/**
* holds the cluster centroids
*/
private Instances m_ClusterCentroids;
/**
* temporary variable holding cluster assignments while iterating
*/
private int [] m_ClusterAssignments;
/**
* random seed
*/
private int m_Seed = 10;
/**
* attribute min values
*/
private double [] m_Min;
/**
* attribute max values
*/
private double [] m_Max;
/**
* Keep track of the number of iterations completed before convergence
*/
private int m_Iterations = 0;
/**
* Returns a string describing this clusterer
* @return a description of the evaluator suitable for
* displaying in the explorer/experimenter gui
*/
public String globalInfo() {
return "Cluster data using the k means algorithm";
}
/**
* Generates a clusterer. Has to initialize all fields of the clusterer
* that are not being set via options.
*
* @param data set of instances serving as training data
* @exception Exception if the clusterer has not been
* generated successfully
*/
public void buildClusterer(Instances data) throws Exception {
m_Iterations = 0;
if (data.checkForStringAttributes()) {
throw new Exception("Can't handle string attributes!");
}
m_ReplaceMissingFilter = new ReplaceMissingValuesFilter();
m_ReplaceMissingFilter.setInputFormat(data);
m_instances = Filter.useFilter(data, m_ReplaceMissingFilter);
m_Min = new double [m_instances.numAttributes()];
m_Max = new double [m_instances.numAttributes()];
for (int i = 0; i < m_instances.numAttributes(); i++) {
m_Min[i] = m_Max[i] = Double.NaN;
}
for (int i = 0; i < m_instances.numInstances(); i++) {
updateMinMax(m_instances.instance(i));
}
m_ClusterCentroids = new Instances(m_instances, m_NumClusters);
m_ClusterAssignments = new int [m_instances.numInstances()];
Random RandomO = new Random(m_Seed);
boolean [] selected = new boolean[m_instances.numInstances()];
int instIndex;
for (int i = 0; i < m_NumClusters; i++) {
do {
instIndex = Math.abs(RandomO.nextInt()) %
m_instances.numInstances();
} while (selected[instIndex]);
m_ClusterCentroids.add(m_instances.instance(instIndex));
selected[instIndex] = true;
}
selected = null;
boolean converged = false;
while (!converged) {
m_Iterations++;
converged = true;
for (int i = 0; i < m_instances.numInstances(); i++) {
Instance toCluster = m_instances.instance(i);
int newC = clusterProcessedInstance(toCluster);
if (newC != m_ClusterAssignments[i]) {
converged = false;
}
m_ClusterAssignments[i] = newC;
// System.out.println(newC);
}
Instances [] tempI = new Instances[m_NumClusters];
// update centroids
m_ClusterCentroids = new Instances(m_instances, m_NumClusters);
for (int i = 0; i < m_NumClusters; i++) {
tempI[i] = new Instances(m_instances, 0);
}
for (int i = 0; i < m_instances.numInstances(); i++) {
tempI[m_ClusterAssignments[i]].add(m_instances.instance(i));
}
for (int i = 0; i < m_NumClusters; i++) {
double [] vals = new double[m_instances.numAttributes()];
for (int j = 0; j < m_instances.numAttributes(); j++) {
vals[j] = tempI[i].meanOrMode(j);
}
m_ClusterCentroids.add(new Instance(1.0, vals));
}
}
}
/**
* clusters an instance that has been through the filters
*
* @param instance the instance to assign a cluster to
* @return a cluster number
*/
private int clusterProcessedInstance(Instance instance) {
double minDist = Integer.MAX_VALUE;
int bestCluster = 0;
for (int i = 0; i < m_NumClusters; i++) {
double dist = distance(instance, m_ClusterCentroids.instance(i));
if (dist < minDist) {
minDist = dist;
bestCluster = i;
}
}
return bestCluster;
}
/**
* Classifies a given instance.
*
* @param instance the instance to be assigned to a cluster
* @return the number of the assigned cluster as an interger
* if the class is enumerated, otherwise the predicted value
* @exception Exception if instance could not be classified
* successfully
*/
public int clusterInstance(Instance instance) throws Exception {
m_ReplaceMissingFilter.input(instance);
m_ReplaceMissingFilter.batchFinished();
Instance inst = m_ReplaceMissingFilter.output();
return clusterProcessedInstance(inst);
}
/**
* Calculates the distance between two instances
*
* @param test the first instance
* @param train the second instance
* @return the distance between the two given instances, between 0 and 1
*/
private double distance(Instance first, Instance second) {
double distance = 0;
int firstI, secondI;
for (int p1 = 0, p2 = 0;
p1 < first.numValues() || p2 < second.numValues();) {
if (p1 >= first.numValues()) {
firstI = m_instances.numAttributes();
} else {
firstI = first.index(p1);
}
if (p2 >= second.numValues()) {
secondI = m_instances.numAttributes();
} else {
secondI = second.index(p2);
}
if (firstI == m_instances.classIndex()) {
p1++; continue;
}
if (secondI == m_instances.classIndex()) {
p2++; continue;
}
double diff;
if (firstI == secondI) {
diff = difference(firstI,
first.valueSparse(p1),
second.valueSparse(p2));
p1++; p2++;
} else if (firstI > secondI) {
diff = difference(secondI,
0, second.valueSparse(p2));
p2++;
} else {
diff = difference(firstI,
first.valueSparse(p1), 0);
p1++;
}
distance += diff * diff;
}
return Math.sqrt(distance / m_instances.numAttributes());
}
/**
* Computes the difference between two given attribute
* values.
*/
private double difference(int index, double val1, double val2) {
switch (m_instances.attribute(index).type()) {
case org.agentacademy.modules.dataminer.core.Attribute.NOMINAL:
// If attribute is nominal
if (Instance.isMissingValue(val1) ||
Instance.isMissingValue(val2) ||
((int)val1 != (int)val2)) {
return 1;
} else {
return 0;
}
case org.agentacademy.modules.dataminer.core.Attribute.NUMERIC:
// If attribute is numeric
if (Instance.isMissingValue(val1) ||
Instance.isMissingValue(val2)) {
if (Instance.isMissingValue(val1) &&
Instance.isMissingValue(val2)) {
return 1;
} else {
double diff;
if (Instance.isMissingValue(val2)) {
diff = norm(val1, index);
} else {
diff = norm(val2, index);
}
if (diff < 0.5) {
diff = 1.0 - diff;
}
return diff;
}
} else {
return norm(val1, index) - norm(val2, index);
}
default:
return 0;
}
}
/**
* Normalizes a given value of a numeric attribute.
*
* @param x the value to be normalized
* @param i the attribute's index
*/
private double norm(double x, int i) {
if (Double.isNaN(m_Min[i]) || Utils.eq(m_Max[i],m_Min[i])) {
return 0;
} else {
return (x - m_Min[i]) / (m_Max[i] - m_Min[i]);
}
}
/**
* Updates the minimum and maximum values for all the attributes
* based on a new instance.
*
* @param instance the new instance
*/
private void updateMinMax(Instance instance) {
for (int j = 0;j < m_instances.numAttributes(); j++) {
if (!instance.isMissing(j)) {
if (Double.isNaN(m_Min[j])) {
m_Min[j] = instance.value(j);
m_Max[j] = instance.value(j);
} else {
if (instance.value(j) < m_Min[j]) {
m_Min[j] = instance.value(j);
} else {
if (instance.value(j) > m_Max[j]) {
m_Max[j] = instance.value(j);
}
}
}
}
}
}
/**
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -