function [ forests ] = NicForest_train_parallel(xtrain,ytrain,Ntrees,width, opt) % [ forests , ytrain_pred ] = NicForest_train_parallel(xtrain,ytrain,NTrees,width, opt) % Trains a forest using specified NTrees, Width, and MCMC parameters % stored in opt, in parallel using parfor. % $LastChangedBy: alistair $ % $LastChangedDate: 2012-05-16 13:48:37 +0100 (Wed, 16 May 2012) $ % $Revision: 10 $ % Originally written on GLNXA64 by Alistair Johnson, 09-May-2012 16:26:13 % Contact: alistairewj@gmail.com %% Initialise parameters [N,P]=size(xtrain); if nargin>4 MCCite=opt.MCCite; % number of Iterations for each MCMC repetition MCCsave=opt.MCCsave; % Save forest every MCCsave MCCres=opt.MCCres; % Number of reset during MCCres iterations MCCnbr = MCCite*MCCres; % next two are chosen on therotical grounds based on the data % Get refereence and try to implement automated parameters here % width^2*Ntrees/2 ~ var(logit(Pi)) where Pi is the pred from reasonable model. % Ntrees=500; NtreesUpdate = opt.NtreesUpdate; else %=== defaults MCCite=100000; % number of Iterations for each MCMC repetition MCCsave=2e3; % Save forest every MCCsave MCCres=2; % Number of reset during MCCres iterations MCCnbr = MCCite*MCCres; % next two are chosen on therotical grounds based on the data % Get refereence and try to implement automated parameters here % width^2*Ntrees/2 ~ var(logit(Pi)) where Pi is the pred from reasonable model. % Ntrees=500; NtreesUpdate = 2; end %=== Rank and replace training data xtrain = tiedrankrelative(xtrain); NaNNbre = sum( ~isnan(xtrain) , 1 ) + 1; %+1 is to symetrize the ratio number of observation/NaNNbre %=== Determine method: regression or classification num_tar = numel(unique(ytrain)); if num_tar==1 error('Only one class provided'); elseif num_tar==2 likelihoodFcn = 'll'; %=== set default intercept+sigma intercept = log(1/(-1+1/mean(ytrain))); sigma = 0; else likelihoodFcn = 'normll'; intercept = mean(ytrain); sigma = sqrt(var(ytrain)); end %% Normalize data % squish data by assuming it is drawn from a normal distribution %=== Scale ranks between 0->1 xtrain_normalized = bsxfun(@rdivide,xtrain,NaNNbre); %=== Squish into normal distribution! xtrain_normalized=norminv(xtrain_normalized,0,1); %=== Initialize forest %=== Create cell - each cell contains independent MCMC runs forests_cell = cellfun(@(x) zeros(floor(MCCite/MCCsave*4/5), 11, Ntrees+1), cell(MCCres,1), 'UniformOutput',false); priors_cell = cellfun(@(x) zeros(floor(MCCite/MCCsave*4/5), 2), cell(MCCres,1), 'UniformOutput',false); % goes, number of forest % 11 paramers in each tree % trees+1 is number of trees in the forest + 1 is intercept % (b0 in regression) %% Monte carlo count=0; for r=1:MCCres %=========================% %=== RESET MONTE CARLO ===% %=========================% %=== Initialize forest [forest adds sums] = initialize_forest( Ntrees , P , ytrain ); BurnInFlag = false; %=== Initialize priors + add to adds/sums prior = zeros(2,MCCite+1); % 2xMCCite - [intercept; sigma] prior(1,1) = intercept; % initial sigma prior(2,1) = sigma; % initial sigma %=== for efficiency, generate all the random variables used to update %priors before the MCMC loop prior(2,2:end) = normrnd(0,width,1,size(prior,2)-1); prior(2,2:end) = gamrnd(1000000,1/1000000,1,size(prior,2)-1); % prior(2,2:end) = 0.02*rand(1,size(prior,2)-1) - 0.01; % uniform distribution adds(:,Ntrees+1)=prior(1,1); sums=sums+prior(1,1); %=== Keep track of the variables used in vector varUsed = zeros(3,P); for k=1:3 varUsed(k,:) = hist(forest(k,:),1:P); % count number of variables used in forest end %=========================% %=== BEGIN MONTE CARLO ===% %=========================% for j=1:MCCite % for each MCMC iteration % Create the next forest candidate cForest = forest; prior(1,j+1)=prior(1,j)+prior(1,j+1); % randomize intercept prior(2,j+1)=abs(prior(2,j)*prior(2,j+1)); % randomize sigma v=zeros(N,NtreesUpdate); tempsums=sums+prior(1,j+1)-prior(1,j); % current tree intercept contribution UpdatedTreesNbr= randsample(Ntrees,NtreesUpdate,0); % randomly selected 2 trees % each iteration considers 2 random trees to update for i=1:NtreesUpdate % Update tree in the forest cForest(:,UpdatedTreesNbr(i)) = update_tree( forest(:,UpdatedTreesNbr(i)) ,.5 , width , P, varUsed) ; % Apply tree v(:,i) = apply_tree( cForest(:,UpdatedTreesNbr(i)) , xtrain , NaNNbre, xtrain_normalized ); % temp score matrix where to update only the two changed trees % where adds is he old contribution (overwriten 2 trees) tempsums=tempsums+v(:,i)-adds(:,UpdatedTreesNbr(i)); end % Compute log likelyhood for current and previous forest e1=feval(likelihoodFcn,sums,ytrain,prior(2,j)); % log likelihood previous forest e2=feval(likelihoodFcn,tempsums,ytrain,prior(2,j+1)); % log likelihood new forest % Metropolis accepting step if(rand(1)= (MCCite/5) % >= 20% into sampling % if MCCsave = 2000, MCCres = 40, then never in burn-in period BurnInFlag=true; % no longer in burn-in period. end if BurnInFlag count=count+1; forests_cell{r}(count,:,:)=forest; priors_cell{r}(count,:)= prior(:,j+1); end fprintf('%2.0f%% complete.\n',(j+(r-1)*MCCite)/MCCnbr*100); end end %=== Remove extra forests which occur when parameters MCCSave + MCCIte don't match if count < size(forests_cell{r},1) forests_cell{r} = forests_cell{r}(1:count,:,:); end end %=== Create forests/priors forests = cell2mat(forests_cell); priors = cell2mat(priors_cell); %=== Add intercept prior to end of forest matrix forests = cat(3,forests,repmat(priors(:,1),1,11)); % if num_tar>2 % forests = cat(3,forests,repmat(priors(:,2),1,11)); % sigma % end end function [p] = ll(pred,tar,sigma) % log likelihood pred = invlogit(pred); p = sum(-tar.*log(pred)-(1-tar).*log(1-pred)); end function [p] = normll(pred,tar,sigma) % log likelihood p = log(sigma)*numel(pred) + 0.5*sum((tar-pred).^2)/(sigma^2); end function [p] = invlogit(p) % inverse logit p = 1./(1+exp(-p)); end function [forest adds sums] = initialize_forest( Ntrees , Nvar , ytrain ) forest=zeros(11,Ntrees); %initialize matrix of trees (current forest) N=length(ytrain); for i=1:3 forest(i,1:Ntrees) = randsample(1:Nvar,Ntrees,1); end forest(7:9,:)=0.5; adds=zeros(N,Ntrees+1); sums=zeros(N,1); end function cT = update_tree(cT,p,width,varNum,varUsed) % tree is a vector has the following parameters % - 1-3: Variables indices for first three nodes % - 4-5: Threshods for nodes 1 and 2 % - 6: slope % - 7-9 : missing value param for nodes 1 to 3 % - 10 : architechture type (1,2,3,4), i.e. location of the final node in the tree % - 11: intercept % ramdomly create a entirely new tree or a similar to speed up p2 = 0.5; % Probability of propagating a variable towards bottom of the tree leakage = 0.1; % Probability of ignoring weights for variable selection varTemp = sum(varUsed(1:2,:),1); if(rand(1)