ams.m
function [sig,C,alpha,b] = ams(X,Y,method,sphere,opt)
% AMS Automatic Model Selection for Kernel Methods with
%   - RBF Gaussian kernel
%   - L2 penalization of the errors
% For classification: Support Vector Machines
% For regression: Kernel Ridge Regression
%
% Usage: [sig,C,alpha,b] = AMS(X,Y,method,[sphere],[opt])
%
% X: an n times d matrix containing n training points in d dimensions
% Y: an n dimensional vector containing either the class labels
%    (-1 or 1) in classification or the target values in regression.
% method is either:
%   'loo': minimizes the leave-one-out error. The LOO is an accurate
%          estimate of the generalization error but might be difficult
%          to minimize (not smooth). Also, do not use if sphere = 0
%          and d is large (long time for gradient calculations).
%   'rm':  radius / margin bound. Easier to minimize. This can be the
%          first thing to try.
%   'val': minimizes a validation error. Good if there are a lot
%          of training points. The validation points are the last
%          nv points of the training set [nv specified in the options].
%   'ev':  evidence maximization.
% sphere: if 1, spherical RBF kernel = exp(-0.5*||x-y||^2 / sigma^2)
%         if 0, one sigma per component:
%               exp(-0.5*sum (x_i-y_i)^2/sigma_i^2)
%         [default 1]
% opt: structure containing the options (default value in brackets)
%   itermax: maximum number of conjugate gradient steps [100]
%   tolX:    gradient norm stopping criterion [1e-5]
%   hregul:  regularization on the hyperparameters (to avoid extreme values) [0.01]
%   nonorm:  if 1, do not normalize the inputs [0]
%   verb:    verbosity [0]
%   eta:     smoothing parameter used in classif by 'ev' and 'loo',
%            larger is smoother [0.1]
%   sigmoid: slope of the sigmoid used by 'val' and 'loo' in classif [5]
%   nv:      number of validation points for 'val' [n/2]
%
% sig:     the sigma(s) found by the model selection. Scalar if
%          sphere = 1, d dimensional vector otherwise
% C:       the constant penalizing the training errors
% alpha,b: the parameters of the prediction function
%          f(x) = sum alpha_i K(x,x_i) + b

% Copyright Olivier Chapelle
% olivier.chapelle@tuebingen.mpg.de
% last modified 21.04.2005

% Initialization
[n,d] = size(X);
if (size(Y,1)~=n) | (size(Y,2)~=1)
  error('Dimension error');
end;
if all(abs(Y)==1)
  classif = 1;   % We are in classification
else
  classif = 0;   % We are in regression
end;

if nargin < 5
  opt = [];      % Assign the options to their default values
end;
if ~isfield(opt,'itermax'), opt.itermax = 100;         end;
if ~isfield(opt,'length'),  opt.length  = opt.itermax; end;
if ~isfield(opt,'tolX'),    opt.tolX    = 1e-5;        end;
if ~isfield(opt,'hregul'),  opt.hregul  = 1e-2;        end;
if ~isfield(opt,'nonorm'),  opt.nonorm  = 0;           end;
if ~isfield(opt,'verb'),    opt.verb    = 0;           end;
if ~isfield(opt,'eta'),     opt.eta     = 0.1;         end;
if ~isfield(opt,'sigmoid'), opt.sigmoid = 5;           end;
if ~isfield(opt,'nv'),      opt.nv      = round(n/2);  end;

if nargin<4
  sphere = 1;
end;

% Normalization
if ~opt.nonorm & (norm(std(X)-1) > 1e-5*d)
  X = X./repmat(std(X),size(X,1),1);
  fprintf(['The data have been normalized (each component is divided by ' ...
           'its standard deviation).\n Don''t forget to do the same for the ' ...
           'test set.\n']);
end;

% Default values of the hyperparameters (in log scale)
C = 0;
sig = .3*var(X)';
if ~sphere, sig = log(d*sig) / 2; else sig = log(sum(sig)) / 2; end;

% Do the optimization on the hyperparameters
param = [sig; C];
default_param = param;

% check_all_grad(param,X,Y,classif,default_param,opt); return;
% plot_criterion(X,Y,param,[-3:0.1:3],classif,default_param,opt);
% return;

[param,fX] = minimize(param,@obj_fun,opt,X,Y,method,classif,default_param,opt);

% Prepare the outputs
[alpha, b] = learn(X,Y,param,classif);
C = exp(param(end));
sig = exp(param(1:end-1));


function plot_criterion(X,Y,param,range,classif,default_param,opt)
% Debugging helper: plot each criterion as a function of an offset
% added to log(sigma) and log(C).
for i=1:length(range)
  fprintf('%f\r',range(i));
  for j=1:2
    param2 = param;
    param2(end-j+1) = param2(end-j+1)+range(i);
    obj(1,i,j) = obj_fun(param2,X,Y,'loo',classif,default_param,opt);
    obj(2,i,j) = obj_fun(param2,X,Y,'rm', classif,default_param,opt);
    obj(3,i,j) = obj_fun(param2,X,Y,'val',classif,default_param,opt);
    obj(4,i,j) = exp(obj_fun(param2,X,Y,'ev', classif,default_param,opt));
  end;
end;
figure; plot(obj(:,:,1)');
figure; plot(obj(:,:,2)');


function check_all_grad(param,X,Y,classif,default_param,opt)
% Debugging helper: check the analytical gradients of all criteria
% by finite differences (requires checkgrad.m).
param = randn(size(param));
checkgrad(@obj_fun,param,1e-6,X,Y,'loo',classif,default_param,opt)
checkgrad(@obj_fun,param,1e-6,X,Y,'rm', classif,default_param,opt)
checkgrad(@obj_fun,param,1e-6,X,Y,'val',classif,default_param,opt)
checkgrad(@obj_fun,param,1e-6,X,Y,'ev', classif,default_param,opt)


function [obj, grad] = obj_fun(param,X,Y,meth,classif,default_param,opt)
% Compute the model selection criterion and its derivatives with
% respect to param, which is a vector containing, in log scale,
% sigma and C.

% Add a regularization on the hyperparameters to avoid that
% they take extreme values
obj0  = opt.hregul * mean((param-default_param).^2);
grad0 = 2*opt.hregul * (param-default_param) / length(param);
if obj0 > 1        % Extreme value of hyperparameters. The rest
  obj  = obj0;     % might not even work because of numerical
  grad = grad0;    % instabilities -> exit
  return;
end

if strcmp(meth,'val')  % Remove the validation set for the learning
  [alpha,b] = learn(X(1:end-opt.nv,:),Y(1:end-opt.nv),param,classif);
else
  [alpha,b] = learn(X,Y,param,classif);
end;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Compute the objective function %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
sv = find(alpha~=0);   % Find the set of support vectors

switch lower(meth)

 %
 % Leave-one-out
 %
 case 'loo'
  K = compute_kernel(X(sv,:),[],param);
  K(end+1,:) = 1;    % To take the bias into account
  K(:,end+1) = 1;
  K(end,end) = 0;
  D = opt.eta./abs(alpha(sv));
  invKD = inv(K+diag([D;0]));
  invKD(:,end) = [];
  invKD(end,:) = [];
  span = 1./diag(invKD) - D;   % "Span" of the support vectors
  % Estimated change of the output of each point when it's
  % removed from the training set.
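  % (This is the span rule of Vapnik and Chapelle: removing support
  % vector i changes its output by approximately alpha_i * span_i,
  % with the span computed from the inverse of the kernel matrix
  % restricted to the support vectors. The term D = eta./abs(alpha)
  % smooths the estimate so the criterion remains differentiable.)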
  change = alpha(sv).*span;
  if classif
    out = 1-Y(sv).*change;
    obj = sum(tanh(-opt.sigmoid*out)+1)/2/length(Y);
  else
    obj = mean(change.^2);
  end;
  invK = inv(K);
  invK(:,end) = [];
  invK(end,:) = [];

 %
 % Radius margin
 %
 case 'rm'
  K = compute_kernel(X,[],param);
  w2 = sum(alpha.*Y);
  if classif
    R2 = mean(diag(K)) - mean(mean(K));   % Radius estimated by variance
  else
    K(end+1,:) = 1;                       % Radius = mean span
    K(:,end+1) = 1;
    K(end,end) = 0;
    invK = inv(K);
    span = 1./diag(invK);
    R2 = mean(span(1:end-1));
  end;
  obj = R2*w2 / length(Y);

 %
 % Validation set
 %
 case 'val'
  K = compute_kernel(X(1:end-opt.nv,:),X(end-opt.nv+1:end,:),param);
  Yv = Y(end-opt.nv+1:end);
  out = K(end-opt.nv+1:end,:)*alpha+b;    % Output on the validation points
  if classif
    % Goes through a sigmoid because the step function is not differentiable
    obj = mean(tanh(-opt.sigmoid*out.*Yv)+1)/2;
  else
    obj = mean((out-Yv).^2);
  end;
  Kl = K(sv,sv);
  Kl(end+1,:) = 1;
  Kl(:,end+1) = 1;
  Kl(end,end) = 0;
  invK = inv(Kl);

 %
 % Evidence maximization
 %
 case 'ev'
  K = compute_kernel(X,[],param);
  if classif
    % Normally, K should be computed only on the support vectors. But
    % then, the evidence wouldn't be continuous. We thus make a soft
    % transition for small alphas.
    weight = min(0, (alpha.*Y)/(mean(alpha.*Y))/opt.eta - 1);
    weight2 = 1 - weight.^2;
    K0 = K;
    K = K.*(max(weight2*weight2',eye(length(weight2))));
    Kl = K0(sv,sv);
    Kl(end+1,:) = 1;
    Kl(:,end+1) = 1;
    Kl(end,end) = 0;
    invKl = inv(Kl);
    invKl(:,end) = [];
    invKl(end,:) = [];
  end;
  [invK, logdetK] = inv_logdet_pd_(K);
  obj = logdetK/length(K) + log(mean(alpha.*Y));

 otherwise
  error('Unknown model selection criterion');
end;

%%%%%%%%%%%%%%%%%%%%%%%%%
% Compute the gradients %
%%%%%%%%%%%%%%%%%%%%%%%%%
for i=1:length(param)
  switch lower(meth)

   %
   % Leave-one-out
   %
   case 'loo'
    dK = compute_kernel(X(sv,:),[],param,i);
    dalpha = -invK * (dK*alpha(sv));
    dD = - opt.eta * dalpha.*sign(alpha(sv))./alpha(sv).^2;
    dspan = diag(invKD).^(-2).* ...
            diag(invKD*(dK+diag(dD))*invKD) - dD;
    dchange = dalpha.*span + alpha(sv).*dspan;
    if classif
      dout = -Y(sv).*dchange;
      grad(i) = -0.5*opt.sigmoid * ...
                sum(cosh(-opt.sigmoid*out).^(-2).*dout)/length(Y);
    else
      grad(i) = 2*mean(change.*dchange);
    end;

   %
   % Radius margin
   %
   case 'rm'
    dK = compute_kernel(X,[],param,i);
    dw2 = -alpha'*dK*alpha;
    if classif
      dR2 = mean(diag(dK)) - mean(mean(dK));
    else
      dspan = diag(invK).^(-2).*diag(invK(:,1:end-1)*dK*invK(1:end-1,:));
      dR2 = mean(dspan(1:end-1));
    end;
    grad(i) = (dR2*w2 + R2*dw2) / length(Y);

   %
   % Validation set
   %
   case 'val'
    dK = compute_kernel(X(1:end-opt.nv,:),X(end-opt.nv+1:end,:),param,i);
    dalpha = -invK(:,1:end-1) * (dK(sv,:)*alpha);
    db = dalpha(end);
    dalpha = dalpha(1:end-1);
    dout = K(end-opt.nv+1:end,sv)*dalpha + db + ...
           dK(end-opt.nv+1:end,:) * alpha;
    if classif
      grad(i) = -0.5*opt.sigmoid * ...
                mean(cosh(-opt.sigmoid*out.*Yv).^(-2).*dout.*Yv);
    else
      grad(i) = 2*mean((out-Yv).*dout);
    end;

   %
   % Evidence maximization
   %
   case 'ev'
    dK = compute_kernel(X,[],param,i);
    dK0 = dK;
    if classif
      dalpha = zeros(size(alpha));
      dalpha(sv) = -invKl * (dK(sv,:)*alpha);
      dweight = -2/opt.eta * weight.* ...
                (dalpha.*Y/mean(alpha.*Y) - (alpha.*Y) * mean(dalpha.*Y)/mean(alpha.*Y)^2);
      dK = dK .* max(weight2*weight2',eye(length(weight))) + ...
           (K0-diag(diag(K0))) .* (weight2*dweight' + dweight*weight2');
    end;
    grad(i) = invK(:)'*dK(:)/length(K) - ...
              alpha'*dK0*alpha / sum(alpha.*Y);
  end;
end;

obj  = obj0  + obj;
grad = grad0 + grad';


function [alpha,b] = learn(X,Y,param,classif)
% Do the actual learning (SVM or kernel ridge regression)

K = compute_kernel(X,[],param);
svset = 1:length(Y);
old_svset = [];
iter = 0;
itermax = 20;

% If the set of support vectors has changed, we need to
% reiterate. Note that for regression, all points are support
% vectors and there is only one iteration.
while ~isempty(setxor(svset,old_svset)) & (iter<itermax)
  old_svset = svset;

  H = K(svset,svset);
  H(end+1,:) = 1;    % To take the bias into account
  H(:,end+1) = 1;
  H(end,end) = 0;

  % Find the parameters
  par = H\[Y(svset);0];
  alpha = zeros(length(Y),1);
  alpha(svset) = par(1:end-1)';
  b = par(end);

  % Compute the new set of support vectors
  if classif
    % Compute the outputs by removing the ridge in K
    out = K*alpha+b - alpha/exp(param(end));
    svset = find(Y.*out < 1);
  end;

  iter = iter + 1;
end;

if iter == itermax
  warning('Maximum number of Newton steps reached');
elseif classif & any(alpha.*Y<0)
  warning('Bug in learn');
end;


function K = compute_kernel(X,Y,param,diff)
% Compute the RBF kernel matrix between [X;Y] and X.
% If diff is given, compute the derivative of the kernel matrix
% with respect to the diff-th parameter of param.

global K0    % Keep the kernel matrix in memory; we'll need it to
             % compute the derivatives.

Z = [X; Y];
Y = X;
X = Z;
clear Z;

ridge = exp(-param(end));

if nargin<4   % Compute the kernel matrix
  % Divide by the length scales
  if length(param)==2
    X = X * exp(-param(1));
    Y = Y * exp(-param(1));
  else
    X = X .* repmat(exp(-param(1:end-1)'),size(X,1),1);
    Y = Y .* repmat(exp(-param(1:end-1)'),size(Y,1),1);
  end;
  normX = sum(X.^2,2);
  normY = sum(Y.^2,2);
  K0 = exp(-0.5*(repmat(normX ,1,size(Y,1)) + ...
                 repmat(normY',size(X,1),1) - 2*X*Y'));
  K = K0 + eye(size(K0))*ridge;
else          % Compute the derivative
  if diff == length(param)     % With respect to log(C)
    K = - ridge*eye(size(K0));
  elseif length(param) == 2    % With respect to spherical log(sig)
    K = -2*K0.*log(K0+eps);
  else                         % With respect to log(sig_i)
    dist = repmat(X(:,diff),1,size(Y,1)) - ...
           repmat(Y(:,diff)',size(X,1),1);
    K = K0 .* (dist/exp(param(diff))).^2;
  end;
end;


function [invA, logdetA] = inv_logdet_pd_(A)
% For those who don't have the inv_logdet_pd function written by Carl

if exist('inv_logdet_pd')
  [invA, logdetA] = inv_logdet_pd(A);
else
  invA = inv(A);
  C = chol(A);
  logdetA = 2*sum(log(diag(C)));
end;
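
% ---------------------------------------------------------------------
% Example usage (an illustrative sketch, not part of the original file;
% it assumes minimize.m from this package is on the path, and Xt denotes
% hypothetical test points with the same normalization as the training X):
%
%   X = randn(200,5);                      % 200 training points, 5 dims
%   Y = sign(X(:,1) + 0.5*randn(200,1));   % +/-1 labels -> classification (SVM)
%   [sig,C,alpha,b] = ams(X,Y,'rm');       % radius/margin bound, spherical kernel
%
%   % Prediction f(x) = sum alpha_i K(x,x_i) + b, with the spherical RBF kernel.
%   % If ams reported that it normalized the data, divide each column of Xt
%   % by the standard deviation of the corresponding training column first.
%   D2 = repmat(sum(Xt.^2,2),1,size(X,1)) + ...
%        repmat(sum(X.^2,2)',size(Xt,1),1) - 2*Xt*X';
%   Yt = sign(exp(-0.5*D2/sig^2)*alpha + b);
% ---------------------------------------------------------------------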