function [ width ] = NicForest_CalculateWidth(xtrain,ytrain,opt)
%NICFOREST_CALCULATEWIDTH Calculate a starting value for the width parameter
%   [ width ] = NicForest_CalculateWidth(xtrain,ytrain,opt) calculates a
%   reasonable starting value for the width parameter used in the ensemble
%   forest development.
%
%   The branch taken depends on the number of distinct values in ytrain:
%     1 distinct value  - an error is raised.
%     2 distinct values - (binary classification) two short forests are
%                         trained on alternating-row halves of the data,
%                         each is applied to the other half, and the width
%                         is derived from the variance of the logit of the
%                         held-out predictions.
%     otherwise         - width = 2/sqrt(opt.Trees).
%
%   Inputs:
%       xtrain - Training features, one row per observation
%       ytrain - Training targets, one row per observation
%       opt    - Options structure. opt.Trees (number of trees) is read
%                here; in the binary case the structure is also passed
%                through forest_opt_set and on to NicForest_train.
%
%   Outputs:
%       width  - Scalar initial value for the intercept prior's width
%
%   Example
%       [ width ] = NicForest_CalculateWidth(xtrain,ytrain,opt)
%
%   See also NICFOREST NICFOREST_TRAIN

%   $LastChangedBy: alistair $
%   $LastChangedDate: 2012-05-30 12:21:30 +0100 (Wed, 30 May 2012) $
%   $Revision: 21 $
%   Originally written on GLNXA64 by Alistair Johnson, 09-May-2012 16:26:13
%   Contact: alistairewj@gmail.com

% Read the tree count before opt is overwritten for the quick run below.
Ntrees = opt.Trees;

num_tar = numel(unique(ytrain));
if num_tar==1
    error('Only one class provided');
elseif num_tar==2 % binary classification
    %=== Do a quick MCMC to find a reasonable width
    opt = forest_opt_set(opt,...
        'Iterations',20000,...
        'Save',2000,...
        'Resets', 1,...
        'UpdatedTrees', 2,...
        'BurnIn', 20, ...
        'Width', []);

    %=== Split into 2 folds (odd rows / even rows) + train 2 models
    idxSplit = false(size(xtrain,1),1);
    idxSplit(1:2:end) = true;

    %=== Any grouping supplied in opt is ignored for width estimation:
    %    every observation in a fold is placed in a single group.
    %    (this is probably temporary)
    opt1 = opt;
    opt2 = opt;
    N1 = sum(idxSplit);
    N2 = numel(idxSplit)-N1;
    opt1.Group = ones(N1,1);
    opt2.Group = ones(N2,1);

    [ forests1 ] = NicForest_train(xtrain(idxSplit,:),ytrain(idxSplit,:),opt1);
    [ forests2 ] = NicForest_train(xtrain(~idxSplit,:),ytrain(~idxSplit,:),opt2);

    %=== Each model predicts the fold it was NOT trained on
    ypred = zeros(size(ytrain,1),1);
    [ ypred1 ] = NicForest_apply_quick( forests1 , xtrain(~idxSplit,:) );
    ypred(~idxSplit) = ypred1;
    [ ypred2 ] = NicForest_apply_quick( forests2 , xtrain(idxSplit,:) );
    ypred(idxSplit) = ypred2;

    %=== Keep predictions strictly inside (0,1) so logit stays finite and
    %    real. A prediction of exactly 0 or 1 would otherwise give -Inf/Inf
    %    and turn the variance (and hence width) into NaN. Logical indexing
    %    is used (rather than min/max) so NaN predictions are NOT masked.
    ypred(ypred < eps)     = eps;
    ypred(ypred > 1 - eps) = 1 - eps;

    % width^2*Ntrees/4 ~ var(logit(Pi)) where Pi is the pred from a
    % reasonable model.
    width = 2*sqrt(var(logit(ypred))/Ntrees);
else
    % ytrain variance is normalized to be 1 in fcn, so the general form
    %   width = sqrt(4*var(ytrain)/Ntrees)
    % reduces to the expression below.
    width = 2*Ntrees^-0.5;
end

end

function [p] = logit(p)
%LOGIT Log-odds transform, log(p/(1-p)), applied element-wise.
p = log(p) - log(1-p);
end