?? (matlab)kmeans.m
字號:
function [idx, C, sumD, D] = kmeans(X, k, varargin)
%KMEANS K-means clustering.
% IDX = KMEANS(X, K) partitions the points in the N-by-P data matrix
% X into K clusters. This partition minimizes the sum, over all
% clusters, of the within-cluster sums of point-to-cluster-centroid
% distances. Rows of X correspond to points, columns correspond to
% variables. KMEANS returns an N-by-1 vector IDX containing the
% cluster indices of each point. By default, KMEANS uses squared
% Euclidean distances.
%
% [IDX, C] = KMEANS(X, K) returns the K cluster centroid locations in
% the K-by-P matrix C.
%
% [IDX, C, SUMD] = KMEANS(X, K) returns the within-cluster sums of
% point-to-centroid distances in the 1-by-K vector sumD.
%
% [IDX, C, SUMD, D] = KMEANS(X, K) returns distances from each point
% to every centroid in the N-by-K matrix D.
%
% [ ... ] = KMEANS(..., 'PARAM1',val1, 'PARAM2',val2, ...) allows you to
% specify optional parameter name/value pairs to control the iterative
% algorithm used by KMEANS. Parameters are:
%
% 'Distance' - Distance measure, in P-dimensional space, that KMEANS
% should minimize with respect to. Choices are:
% {'sqEuclidean'} - Squared Euclidean distance
% 'cityblock' - Sum of absolute differences, a.k.a. L1
% 'cosine' - One minus the cosine of the included angle
% between points (treated as vectors)
% 'correlation' - One minus the sample correlation between
% points (treated as sequences of values)
% 'Hamming' - Percentage of bits that differ (only
% suitable for binary data)
%
% 'Start' - Method used to choose initial cluster centroid positions,
% sometimes known as "seeds". Choices are:
% {'sample'} - Select K observations from X at random
% 'uniform' - Select K points uniformly at random from
% the range of X. Not valid for Hamming distance.
% 'cluster' - Perform preliminary clustering phase on
% random 10% subsample of X. This preliminary
% phase is itself initialized using 'sample'.
% matrix - A K-by-P matrix of starting locations. In
% this case, you can pass in [] for K, and
% KMEANS infers K from the first dimension of
% the matrix. You can also supply a 3D array,
% implying a value for 'Replicates'
% from the array's third dimension.
%
% 'Replicates' - Number of times to repeat the clustering, each with a
% new set of initial centroids [ positive integer | {1}]
%
% 'Maxiter' - The maximum number of iterations [ positive integer | {100}]
%
% 'EmptyAction' - Action to take if a cluster loses all of its member
% observations. Choices are:
% {'error'} - Treat an empty cluster as an error
% 'drop' - Remove any clusters that become empty, and
% set corresponding values in C and D to NaN.
% 'singleton' - Create a new cluster consisting of the one
% observation furthest from its centroid.
%
% 'Display' - Display level [ 'off' | {'notify'} | 'final' | 'iter' ]
%
% Example:
%
% X = [randn(20,2)+ones(20,2); randn(20,2)-ones(20,2)];
% [cidx, ctrs] = kmeans(X, 2, 'dist','city', 'rep',5, 'disp','final');
% plot(X(cidx==1,1),X(cidx==1,2),'r.', ...
% X(cidx==2,1),X(cidx==2,2),'b.', ctrs(:,1),ctrs(:,2),'kx');
%
% See also LINKAGE, CLUSTERDATA, SILHOUETTE.
% KMEANS uses a two-phase iterative algorithm to minimize the sum of
% point-to-centroid distances, summed over all K clusters. The first
% phase uses what the literature often describes as "batch" updates,
% where each iteration consists of reassigning points to their nearest
% cluster centroid, all at once, followed by recalculation of cluster
% centroids. This phase may be thought of as providing a fast but
% potentially only approximate solution as a starting point for the
% second phase. The second phase uses what the literature often
% describes as "on-line" updates, where points are individually
% reassigned if doing so will reduce the sum of distances, and cluster
% centroids are recomputed after each reassignment. Each iteration
% during this second phase consists of one pass though all the points.
% KMEANS can converge to a local optimum, which in this case is a
% partition of points in which moving any single point to a different
% cluster increases the total sum of distances. This problem can only be
% solved by a clever (or lucky, or exhaustive) choice of starting points.
%
% References:
%
% [1] Seber, G.A.F., Multivariate Observations, Wiley, New York, 1984.
% [2] Spath, H. (1985) Cluster Dissection and Analysis: Theory, FORTRAN
% Programs, Examples, translated by J. Goldschmidt, Halsted Press,
% New York, 226 pp.
% Copyright 1993-2000 The MathWorks, Inc.
% $Revision: 1.4 $ $Date: 2002/05/30 16:13:31 $
%
if nargin < 2
error('At least two input arguments required.');
end
% n points in p dimensional space
[n, p] = size(X);
Xsort = []; Xord = [];
pnames = { 'distance' 'start' 'replicates' 'maxiter' 'emptyaction' 'display'};
dflts = {'sqeuclidean' 'sample' [] 100 'error' 'notify'};
[errmsg,distance,start,reps,maxit,emptyact,display] ...
= statgetargs(pnames, dflts, varargin{:});
error(errmsg);
if ischar(distance)
distNames = {'sqeuclidean','cityblock','cosine','correlation','hamming'};
i = strmatch(lower(distance), distNames);
if length(i) > 1
error(sprintf('Ambiguous ''distance'' parameter value: %s.', distance));
elseif isempty(i)
error(sprintf('Unknown ''distance'' parameter value: %s.', distance));
end
distance = distNames{i};
switch distance
case 'cityblock'
[Xsort,Xord] = sort(X,1);
case 'cosine'
Xnorm = sqrt(sum(X.^2, 2));
if any(min(Xnorm) <= eps * max(Xnorm))
error(['Some points have small relative magnitudes, making them ', ...
'effectively zero.\nEither remove those points, or choose a ', ...
'distance other than ''cosine''.'], []);
end
X = X ./ Xnorm(:,ones(1,p));
case 'correlation'
X = X - repmat(mean(X,2),1,p);
Xnorm = sqrt(sum(X.^2, 2));
if any(min(Xnorm) <= eps * max(Xnorm))
error(['Some points have small relative standard deviations, making them ', ...
'effectively constant.\nEither remove those points, or choose a ', ...
'distance other than ''correlation''.'], []);
end
X = X ./ Xnorm(:,ones(1,p));
case 'hamming'
if ~all(ismember(X(:),[0 1]))
error('Non-binary data cannot be clustered using Hamming distance.');
end
end
else
error('The ''distance'' parameter value must be a string.');
end
if ischar(start)
startNames = {'uniform','sample','cluster'};
i = strmatch(lower(start), startNames);
if length(i) > 1
error(sprintf('Ambiguous ''start'' parameter value: %s.', start));
elseif isempty(i)
error(sprintf('Unknown ''start'' parameter value: %s.', start));
elseif isempty(k)
error('You must specify the number of clusters, K.');
end
start = startNames{i};
if strcmp(start, 'uniform')
if strcmp(distance, 'hamming')
error('Hamming distance cannot be initialized with uniform random values.');
end
Xmins = min(X,1);
Xmaxs = max(X,1);
end
elseif isnumeric(start)
CC = start;
start = 'numeric';
if isempty(k)
k = size(CC,1);
elseif k ~= size(CC,1);
error('The ''start'' matrix must have K rows.');
elseif size(CC,2) ~= p
error('The ''start'' matrix must have the same number of columns as X.');
end
if isempty(reps)
reps = size(CC,3);
elseif reps ~= size(CC,3);
error('The third dimension of the ''start'' array must match the ''replicates'' parameter value.');
end
% Need to center explicit starting points for 'correlation'. (Re)normalization
% for 'cosine'/'correlation' is done at each iteration.
if isequal(distance, 'correlation')
CC = CC - repmat(mean(CC,2),[1,p,1]);
end
else
error('The ''start'' parameter value must be a string or a numeric matrix or array.');
end
if ischar(emptyact)
emptyactNames = {'error','drop','singleton'};
i = strmatch(lower(emptyact), emptyactNames);
if length(i) > 1
error(sprintf('Ambiguous ''emptyaction'' parameter value: %s.', emptyact));
elseif isempty(i)
error(sprintf('Unknown ''emptyaction'' parameter value: %s.', emptyact));
end
emptyact = emptyactNames{i};
else
error('The ''emptyaction'' parameter value must be a string.');
end
if ischar(display)
i = strmatch(lower(display), strvcat('off','notify','final','iter'));
if length(i) > 1
error(sprintf('Ambiguous ''display'' parameter value: %s.', display));
elseif isempty(i)
error(sprintf('Unknown ''display'' parameter value: %s.', display));
end
display = i-1;
else
error('The ''display'' parameter value must be a string.');
end
if k == 1
error('The number of clusters must be greater than 1.');
elseif n < k
error('X must have more rows than the number of clusters.');
end
% Assume one replicate
if isempty(reps)
reps = 1;
end
%
% Done with input argument processing, begin clustering
%
dispfmt = '%6d\t%6d\t%8d\t%12g';
D = repmat(NaN,n,k); % point-to-cluster distances
Del = repmat(NaN,n,k); % reassignment criterion
m = zeros(k,1);
totsumDBest = Inf;
for rep = 1:reps
switch start
case 'uniform'
C = unifrnd(Xmins(ones(k,1),:), Xmaxs(ones(k,1),:));
% For 'cosine' and 'correlation', these are uniform inside a subset
% of the unit hypersphere. Still need to center them for
% 'correlation'. (Re)normalization for 'cosine'/'correlation' is
% done at each iteration.
if isequal(distance, 'correlation')
C = C - repmat(mean(C,2),1,p);
end
case 'sample'
C = double(X(randsample(n,k),:)); % X may be logical
case 'cluster'
Xsubset = X(randsample(n,floor(.1*n)),:);
[dum, C] = kmeans(Xsubset, k, varargin{:}, 'start','sample', 'replicates',1);
case 'numeric'
C = CC(:,:,rep);
end
changed = 1:k; % everything is newly assigned
idx = zeros(n,1);
totsumD = Inf;
if display > 2 % 'iter'
disp(sprintf(' iter\t phase\t num\t sum'));
end
%
% Begin phase one: batch reassignments
%
converged = false;
iter = 0;
while true
% Compute the distance from every point to each cluster centroid
D(:,changed) = distfun(X, C(changed,:), distance, iter);
% Compute the total sum of distances for the current configuration.
% Can't do it first time through, there's no configuration yet.
if iter > 0
totsumD = sum(D((idx-1)*n + (1:n)'));
% Test for a cycle: if objective is not decreased, back out
% the last step and move on to the single update phase
if prevtotsumD <= totsumD
idx = previdx;
[C(changed,:), m(changed)] = gcentroids(X, idx, changed, distance, Xsort, Xord);
iter = iter - 1;
break;
end
if display > 2 % 'iter'
disp(sprintf(dispfmt,iter,1,length(moved),totsumD));
end
if iter >= maxit, break; end
end
% Determine closest cluster for each point and reassign points to clusters
previdx = idx;
prevtotsumD = totsumD;
[d, nidx] = min(D, [], 2);
if iter == 0
% Every point moved, every cluster will need an update
moved = 1:n;
idx = nidx;
changed = 1:k;
else
% Determine which points moved
moved = find(nidx ~= previdx);
if length(moved) > 0
% Resolve ties in favor of not moving
moved = moved(D((previdx(moved)-1)*n + moved) > d(moved));
end
if length(moved) == 0
break;
end
idx(moved) = nidx(moved);
% Find clusters that gained or lost members
changed = unique([idx(moved); previdx(moved)])';
end
% Calculate the new cluster centroids and counts.
[C(changed,:), m(changed)] = gcentroids(X, idx, changed, distance, Xsort, Xord);
iter = iter + 1;
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -