?? kmeans.m
字號:
function converged = batchUpdate
% Every point moved, every cluster will need an update
moved = 1:n;
changed = 1:k;
previdx = zeros(n,1);
prevtotsumD = Inf;
if display > 2 % 'iter'
disp(sprintf(' iter\t phase\t num\t sum'));
end
%
% Begin phase one: batch reassignments
%
iter = 0;
converged = false;
while true
iter = iter + 1;
% Calculate the new cluster centroids and counts, and update the
% distance from every point to those new cluster centroids
[C(changed,:), m(changed)] = gcentroids(X, idx, changed, distance);
D(:,changed) = distfun(X, C(changed,:), distance, iter);
% Deal with clusters that have just lost all their members
empties = changed(m(changed) == 0);
if ~isempty(empties)
switch emptyact
case 'error'
error('stats:kmeans:EmptyCluster', ...
'Empty cluster created at iteration %d%s.',iter,repsMsg(rep,reps));
case 'drop'
% Remove the empty cluster from any further processing
D(:,empties) = NaN;
changed = changed(m(changed) > 0);
warning('stats:kmeans:EmptyCluster', ...
'Empty cluster created at iteration %d%s.',iter,repsMsg(rep,reps));
case 'singleton'
warning('stats:kmeans:EmptyCluster', ...
'Empty cluster created at iteration %d%s.',iter,repsMsg(rep,reps));
for i = empties
d = D((idx-1)*n + (1:n)'); % use newly updated distances
% Find the point furthest away from its current cluster.
% Take that point out of its cluster and use it to create
% a new singleton cluster to replace the empty one.
[dlarge, lonely] = max(d);
from = idx(lonely); % taking from this cluster
if m(from) < 2
% In the very unusual event that the cluster had only
% one member, pick any other non-singleton point.
from = find(m>1,1,'first');
lonely = find(idx==from,1,'first');
end
C(i,:) = X(lonely,:);
m(i) = 1;
idx(lonely) = i;
D(:,i) = distfun(X, C(i,:), distance, iter);
% Update clusters from which points are taken
[C(from,:), m(from)] = gcentroids(X, idx, from, distance);
D(:,from) = distfun(X, C(from,:), distance, iter);
changed = unique([changed from]);
end
end
end
% Compute the total sum of distances for the current configuration.
totsumD = sum(D((idx-1)*n + (1:n)'));
% Test for a cycle: if objective is not decreased, back out
% the last step and move on to the single update phase
if prevtotsumD <= totsumD
idx = previdx;
[C(changed,:), m(changed)] = gcentroids(X, idx, changed, distance);
iter = iter - 1;
break;
end
if display > 2 % 'iter'
disp(sprintf(dispfmt,iter,1,length(moved),totsumD));
end
if iter >= maxit
break;
end
% Determine closest cluster for each point and reassign points to clusters
previdx = idx;
prevtotsumD = totsumD;
[d, nidx] = min(D, [], 2);
% Determine which points moved
moved = find(nidx ~= previdx);
if ~isempty(moved)
% Resolve ties in favor of not moving
moved = moved(D((previdx(moved)-1)*n + moved) > d(moved));
end
if isempty(moved)
converged = true;
break;
end
idx(moved) = nidx(moved);
% Find clusters that gained or lost members
changed = unique([idx(moved); previdx(moved)])';
end % phase one
end % nested function
%------------------------------------------------------------------
function converged = onlineUpdate
% Initialize some cluster information prior to phase two
switch distance
case 'cityblock'
Xmid = zeros([k,p,2]);
for i = 1:k
if m(i) > 0
% Separate out sorted coords for points in i'th cluster,
% and save values above and below median, component-wise
Xsorted = sort(X(idx==i,:),1);
nn = floor(.5*m(i));
if mod(m(i),2) == 0
Xmid(i,:,1:2) = Xsorted([nn, nn+1],:)';
elseif m(i) > 1
Xmid(i,:,1:2) = Xsorted([nn, nn+2],:)';
else
Xmid(i,:,1:2) = Xsorted([1, 1],:)';
end
end
end
case 'hamming'
Xsum = zeros(k,p);
for i = 1:k
if m(i) > 0
% Sum coords for points in i'th cluster, component-wise
Xsum(i,:) = sum(X(idx==i,:), 1);
end
end
end
%
% Begin phase two: single reassignments
%
changed = find(m' > 0);
lastmoved = 0;
nummoved = 0;
iter1 = iter;
converged = false;
while iter < maxit
% Calculate distances to each cluster from each point, and the
% potential change in total sum of errors for adding or removing
% each point from each cluster. Clusters that have not changed
% membership need not be updated.
%
% Singleton clusters are a special case for the sum of dists
% calculation. Removing their only point is never best, so the
% reassignment criterion had better guarantee that a singleton
% point will stay in its own cluster. Happily, we get
% Del(i,idx(i)) == 0 automatically for them.
switch distance
case 'sqeuclidean'
for i = changed
mbrs = (idx == i);
sgn = 1 - 2*mbrs; % -1 for members, 1 for nonmembers
if m(i) == 1
sgn(mbrs) = 0; % prevent divide-by-zero for singleton mbrs
end
Del(:,i) = (m(i) ./ (m(i) + sgn)) .* sum((X - C(repmat(i,n,1),:)).^2, 2);
end
case 'cityblock'
for i = changed
if mod(m(i),2) == 0 % this will never catch singleton clusters
ldist = Xmid(repmat(i,n,1),:,1) - X;
rdist = X - Xmid(repmat(i,n,1),:,2);
mbrs = (idx == i);
sgn = repmat(1-2*mbrs, 1, p); % -1 for members, 1 for nonmembers
Del(:,i) = sum(max(0, max(sgn.*rdist, sgn.*ldist)), 2);
else
Del(:,i) = sum(abs(X - C(repmat(i,n,1),:)), 2);
end
end
case {'cosine','correlation'}
% The points are normalized, centroids are not, so normalize them
normC = sqrt(sum(C.^2, 2));
if any(normC < eps(class(normC))) % small relative to unit-length data points
error('stats:kmeans:ZeroCentroid', ...
'Zero cluster centroid created at iteration %d%s.',iter,repsMsg(rep,reps));
end
% This can be done without a loop, but the loop saves memory allocations
for i = changed
XCi = X * C(i,:)';
mbrs = (idx == i);
sgn = 1 - 2*mbrs; % -1 for members, 1 for nonmembers
Del(:,i) = 1 + sgn .*...
(m(i).*normC(i) - sqrt((m(i).*normC(i)).^2 + 2.*sgn.*m(i).*XCi + 1));
end
case 'hamming'
for i = changed
if mod(m(i),2) == 0 % this will never catch singleton clusters
% coords with an unequal number of 0s and 1s have a
% different contribution than coords with an equal
% number
unequal01 = find(2*Xsum(i,:) ~= m(i));
numequal01 = p - length(unequal01);
mbrs = (idx == i);
Di = abs(X(:,unequal01) - C(repmat(i,n,1),unequal01));
Del(:,i) = (sum(Di, 2) + mbrs*numequal01) / p;
else
Del(:,i) = sum(abs(X - C(repmat(i,n,1),:)), 2) / p;
end
end
end
% Determine best possible move, if any, for each point. Next we
% will pick one from those that actually did move.
previdx = idx;
prevtotsumD = totsumD;
[minDel, nidx] = min(Del, [], 2);
moved = find(previdx ~= nidx);
if ~isempty(moved)
% Resolve ties in favor of not moving
moved = moved(Del((previdx(moved)-1)*n + moved) > minDel(moved));
end
if isempty(moved)
% Count an iteration if phase 2 did nothing at all, or if we're
% in the middle of a pass through all the points
if (iter == iter1) || nummoved > 0
iter = iter + 1;
if display > 2 % 'iter'
disp(sprintf(dispfmt,iter,2,nummoved,totsumD));
end
end
converged = true;
break;
end
% Pick the next move in cyclic order
moved = mod(min(mod(moved - lastmoved - 1, n) + lastmoved), n) + 1;
% If we've gone once through all the points, that's an iteration
if moved <= lastmoved
iter = iter + 1;
if display > 2 % 'iter'
disp(sprintf(dispfmt,iter,2,nummoved,totsumD));
end
if iter >= maxit, break; end
nummoved = 0;
end
nummoved = nummoved + 1;
lastmoved = moved;
oidx = idx(moved);
nidx = nidx(moved);
totsumD = totsumD + Del(moved,nidx) - Del(moved,oidx);
% Update the cluster index vector, and the old and new cluster
% counts and centroids
idx(moved) = nidx;
m(nidx) = m(nidx) + 1;
m(oidx) = m(oidx) - 1;
switch distance
case 'sqeuclidean'
C(nidx,:) = C(nidx,:) + (X(moved,:) - C(nidx,:)) / m(nidx);
C(oidx,:) = C(oidx,:) - (X(moved,:) - C(oidx,:)) / m(oidx);
case 'cityblock'
for i = [oidx nidx]
% Separate out sorted coords for points in each cluster.
% New centroid is the coord median, save values above and
% below median. All done component-wise.
Xsorted = sort(X(idx==i,:),1);
nn = floor(.5*m(i));
if mod(m(i),2) == 0
C(i,:) = .5 * (Xsorted(nn,:) + Xsorted(nn+1,:));
Xmid(i,:,1:2) = Xsorted([nn, nn+1],:)';
else
C(i,:) = Xsorted(nn+1,:);
if m(i) > 1
Xmid(i,:,1:2) = Xsorted([nn, nn+2],:)';
else
Xmid(i,:,1:2) = Xsorted([1, 1],:)';
end
end
end
case {'cosine','correlation'}
C(nidx,:) = C(nidx,:) + (X(moved,:) - C(nidx,:)) / m(nidx);
C(oidx,:) = C(oidx,:) - (X(moved,:) - C(oidx,:)) / m(oidx);
case 'hamming'
% Update summed coords for points in each cluster. New
% centroid is the coord median. All done component-wise.
Xsum(nidx,:) = Xsum(nidx,:) + X(moved,:);
Xsum(oidx,:) = Xsum(oidx,:) - X(moved,:);
C(nidx,:) = .5*sign(2*Xsum(nidx,:) - m(nidx)) + .5;
C(oidx,:) = .5*sign(2*Xsum(oidx,:) - m(oidx)) + .5;
end
changed = sort([oidx nidx]);
end % phase two
end % nested function
end % main function
%------------------------------------------------------------------
function D = distfun(X, C, dist, iter)
%DISTFUN Calculate point to cluster centroid distances.
[n,p] = size(X);
D = zeros(n,size(C,1));
nclusts = size(C,1);
switch dist
case 'sqeuclidean'
for i = 1:nclusts
D(:,i) = (X(:,1) - C(i,1)).^2;
for j = 2:p
D(:,i) = D(:,i) + (X(:,j) - C(i,j)).^2;
end
% D(:,i) = sum((X - C(repmat(i,n,1),:)).^2, 2);
end
case 'cityblock'
for i = 1:nclusts
D(:,i) = abs(X(:,1) - C(i,1));
for j = 2:p
D(:,i) = D(:,i) + abs(X(:,j) - C(i,j));
end
% D(:,i) = sum(abs(X - C(repmat(i,n,1),:)), 2);
end
case {'cosine','correlation'}
% The points are normalized, centroids are not, so normalize them
normC = sqrt(sum(C.^2, 2));
if any(normC < eps(class(normC))) % small relative to unit-length data points
error('stats:kmeans:ZeroCentroid', ...
'Zero cluster centroid created at iteration %d.',iter);
end
for i = 1:nclusts
D(:,i) = max(1 - X * (C(i,:)./normC(i))', 0);
end
case 'hamming'
for i = 1:nclusts
D(:,i) = abs(X(:,1) - C(i,1));
for j = 2:p
D(:,i) = D(:,i) + abs(X(:,j) - C(i,j));
end
D(:,i) = D(:,i) / p;
% D(:,i) = sum(abs(X - C(repmat(i,n,1),:)), 2) / p;
end
end
end % function
%------------------------------------------------------------------
function [centroids, counts] = gcentroids(X, index, clusts, dist)
%GCENTROIDS Centroids and counts stratified by group.
[n,p] = size(X);
num = length(clusts);
centroids = NaN(num,p);
counts = zeros(num,1);
for i = 1:num
members = (index == clusts(i));
if any(members)
counts(i) = sum(members);
switch dist
case 'sqeuclidean'
centroids(i,:) = sum(X(members,:),1) / counts(i);
case 'cityblock'
% Separate out sorted coords for points in i'th cluster,
% and use to compute a fast median, component-wise
Xsorted = sort(X(members,:),1);
nn = floor(.5*counts(i));
if mod(counts(i),2) == 0
centroids(i,:) = .5 * (Xsorted(nn,:) + Xsorted(nn+1,:));
else
centroids(i,:) = Xsorted(nn+1,:);
end
case {'cosine','correlation'}
centroids(i,:) = sum(X(members,:),1) / counts(i); % unnormalized
case 'hamming'
% Compute a fast median for binary data, component-wise
centroids(i,:) = .5*sign(2*sum(X(members,:), 1) - counts(i)) + .5;
end
end
end
end % function
%------------------------------------------------------------------
function s = repsMsg(rep,reps)
% Utility for warning and error messages.
if reps == 1
s = '';
else
s = sprintf(' during replicate %d',rep);
end
end % function
?? 快捷鍵說明
復(fù)制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -