% DPS Correntropy based, hierarchical density preserving data split % % R = DPS(A,LEVELS,CLASSWISE) % [R H] = DPS(A,LEVELS,CLASSWISE) % % INPUT % A Input dataset % LEVELS Number of split levels, default: 3 % CLASSWISE Use (1, default) or ignore (0) label information % % OUTPUT % R Index array with rotation set with 2^LEVELS folds % H Hierarchy of splits % % DESCRIPTION % Density Preserving Sampling (DPS) divides the input dataset into a given % number of folds (2^LEVELS) by maximizing the correntropy between the folds % and can be used as an alternative for cross-validation. The procedure is % deterministic, so unlike cross-validation it does not need to be repeated. % % REFERENCE % M. Budka, B. Gabrys, Correntropy-based density-preserving data sampling % as an alternative to standard cross-validation, IJCNN2010, 1-8 % http://www.budka.co.uk/ % % SEE ALSO (PRTools Guide) % DATASETS, CROSSVAL function [R H] = dps(A,levels,classwise) if (nargin<3) || isempty(classwise), classwise = 1; end if (nargin<2) || isempty(levels), levels = 3; end islabtype(A,'crisp'); H = zeros(levels,size(A,1)); idxs = cell(1,levels+1); idxs{1} = {1:size(A,1)}; for i = 1:levels for j = 1:2^(i-1) t = helper(A(idxs{i}{j},:),classwise); idxs{i+1}{2*j-1} = idxs{i}{j}(t{1}); idxs{i+1}{2*j} = idxs{i}{j}(t{2}); end for j = 1:length(idxs{i+1}) H(i,idxs{i+1}{j}) = j; end end R = H(end,:); end function idxs = helper(A,classwise) % if the classes aro too small to be divided further, switch to classless mode if classwise && all(classsizes(A)>2), c = getsize(A,3); else classwise = 0; c = 1; end siz = zeros(2,1); idx = cell(1,c); for i = 1:c if classwise, [B BI] = seldat(A,i); % select i-th class else B = A; BI = (1:size(A,1)); end m = length(BI); mask = true(1,m); % mask is used for counting remaining objects D = +distm(B) + diag(inf(m,1)); % working distance matrix Dorg = D; % original distance matrix idx{i} = nan(2,ceil(m/2)); for j = 1:floor(m/2) [mD,I] = min(D,[],1); % \ [mmD,J] = min(mD); % find two closest objects I = I(J(1)); J = J(1); % / mask(I) = 0; mask(J) = 0; % mark them as used % split the objects to maximally increase coverage of both subsets if (mean(Dorg(I,idx{i}(1,1:j-1))) + mean(Dorg(J,idx{i}(2,1:j-1))) < ... mean(Dorg(I,idx{i}(2,1:j-1))) + mean(Dorg(J,idx{i}(1,1:j-1)))) idx{i}(1,j) = J; idx{i}(2,j) = I; else idx{i}(1,j) = I; idx{i}(2,j) = J; end % remove used objects from the distance matrix D(I,:) = inf; D(:,I) = inf; D(J,:) = inf; D(:,J) = inf; end if isempty(j), j = 0; end % in case the loop is not entered at all % odd number of points in class if sum(mask)>0 I = find(mask); if siz(1)siz(2) idx{i}(2,end) = I; else if (mean(Dorg(I,idx{i}(1,1:j))) < mean(Dorg(I,idx{i}(2,1:j)))) idx{i}(2,j+1) = I; else idx{i}(1,j+1) = I; end end end % convert indexes from class-specific to dataset-specific idx{i}(1,~isnan(idx{i}(1,:))) = BI(idx{i}(1,~isnan(idx{i}(1,:)))); idx{i}(2,~isnan(idx{i}(2,:))) = BI(idx{i}(2,~isnan(idx{i}(2,:)))); % update fold sizes siz(1) = siz(1) + sum(~isnan(idx{i}(1,:))); siz(2) = siz(2) + sum(~isnan(idx{i}(2,:))); end idx = cell2mat(idx); idxs = cell(1,2); idxs{1} = idx(1,~isnan(idx(1,:))); idxs{2} = idx(2,~isnan(idx(2,:))); end