% function [delta, psi, qt] = viterbiDecodePCG_Springer(observation_sequence, pi_vector, b_matrix, total_obs_distribution, heartrate, systolic_time, Fs, figures)
%
% This function calculates the delta, psi and qt matrices associated with
% the Viterbi decoding algorithm from:
% L. R. Rabiner, "A tutorial on hidden Markov models and selected
% applications in speech recognition," Proc. IEEE, vol. 77, no. 2, pp.
% 257-286, Feb. 1989.
% using equations 32a - 35, and equations 68 - 69 to include the duration
% dependency of the states.
%
% This decoding is performed after the observation probabilities have been
% derived from the logistic regression model of Springer et al:
% D. Springer et al., "Logistic Regression-HSMM-based Heart Sound
% Segmentation," IEEE Trans. Biomed. Eng., In Press, 2015.
%
% Further, this function is extended to allow the duration distributions to extend
% past the beginning and end of the sequence. Without this, the label
% sequence has to start and stop with an "entire" state duration being
% fulfilled. This extension takes away that requirement, by allowing the
% duration distributions to extend past the beginning and end, but only
% considering the observations within the sequence for emission probability
% estimation. More detail can be found in the publication by Springer et
% al., mentioned above.
%
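%% Inputs:
% observation_sequence: The observed features, one row per time step
% pi_vector: the array of initial state probabilities, derived from
% "trainSpringerSegmentationAlgorithm".
% b_matrix: the observation probabilities, derived from
% "trainSpringerSegmentationAlgorithm".
% total_obs_distribution: cell array whose first element is the mean and
% whose second element is the covariance of the multivariate normal
% distribution of all observations, used to obtain P(obs) in the Bayes
% correction of the observation probabilities.
% heartrate: the heart rate of the PCG, extracted using
% "getHeartRateSchmidt"
% systolic_time: the duration of systole, extracted using
% "getHeartRateSchmidt"
% Fs: the sampling frequency of the observation_sequence
% figures: optional boolean variable to show figures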
%
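%% Outputs:
% delta: the matrix of best log-scores from the extended Viterbi recursion,
% with one row per time step (T + max_duration_D - 1 rows) and one column
% per state.
% psi: for each time step and state, the preceding state that maximised
% delta.
% qt: the decoded state sequence for the T samples of the observation
% sequence.
% As Springer et al's algorithm is a duration dependent HMM, there is no
% need to calculate the A_matrix, as the transition between states is only
% dependent on the state durations.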
%
%% Copyright (C) 2016  David Springer
% dave.springer@gmail.com
%
% This program is free software: you can redistribute it and/or modify
% it under the terms of the GNU General Public License as published by
% the Free Software Foundation, either version 3 of the License, or
% any later version.
%
% This program is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
% GNU General Public License for more details.
%
% You should have received a copy of the GNU General Public License
% along with this program.  If not, see <http://www.gnu.org/licenses/>.

function [delta, psi, qt] = viterbiDecodePCG_Springer(observation_sequence, pi_vector, b_matrix, total_obs_distribution, heartrate, systolic_time, Fs,figures)
% Duration-dependent (extended) Viterbi decoding of a four-state heart
% sound sequence.
%
% delta holds the best log-score per time step and state, psi the
% predecessor state that achieved it, and qt the decoded state sequence
% truncated to the T samples of observation_sequence. Both delta and psi
% have T + max_duration_D - 1 rows, because state durations are allowed to
% run past the end of the record.

if nargin < 8
    figures = false;
end

%% Preliminary
springer_options = default_Springer_HSMM_options;

T = length(observation_sequence);
N = 4; % Number of states

% Setting the maximum duration of a single state. This is set to an entire
% heart cycle:
max_duration_D = round((1*(60/heartrate))*Fs);

%Initialising the variables that are needed to find the optimal state path along
%the observation sequence.
%delta_t(j), as defined on page 264 of Rabiner, is the best score (highest
%probability) along a single path, at time t, which accounts for the first
%t observations and ends in State s_j. In this case, the length of the
%matrix is extended by max_duration_D samples, in order to allow the use
%of the extended Viterbi algorithm:
delta = ones(T+ max_duration_D-1,N)*-inf;

%The argument that maximises the transition between states (this is
%basically the previous state that had the highest transition probability
%to the current state) is tracked using the psi variable.
psi = zeros(T+ max_duration_D-1,N);

%An additional variable, that is not included on page 264 of Rabiner, is
%the state duration that maximises the delta variable. This is essential
%for the duration dependent HMM.
psi_duration =zeros(T + max_duration_D-1,N);

%% Setting up observation probs
observation_probs = zeros(T,N);

%MLR gives P(state|obs)
%Therefore, need Bayes to get P(o|state)
%P(o|state) = P(state|obs) * P(obs) / P(states)
%Where p(obs) is derived from a MVN distribution from all
%observations, and p(states) is taken from the pi_vector.
%P(obs) does not depend on the state, so it is evaluated once for all time
%steps instead of once per state and time step:
Po_correction = mvnpdf(observation_sequence(1:T,:),cell2mat(total_obs_distribution(1)),cell2mat(total_obs_distribution(2)));

for n = 1:N
    
    pihat = mnrval(cell2mat(b_matrix(n)),observation_sequence(:,:));
    
    %When saving the coefficients from the logistic
    %regression, it orders them P(class 1) then P(class 2). When
    %training, I label the classes as 0 and 1, so the
    %correct probability would be pihat(2).
    observation_probs(:,n) = (pihat(1:T,2).*Po_correction)./pi_vector(n);
    
end

%% Setting up state duration probabilities, using Gaussian distributions:
[d_distributions, max_S1, min_S1, max_S2, min_S2, max_systole, min_systole, max_diastole, min_diastole] = get_duration_distributions(heartrate,systolic_time);

% Per-state duration limits, indexed by state number
% (1 = S1, 2 = systole, 3 = S2, 4 = diastole):
min_state_duration = [min_S1, min_systole, min_S2, min_diastole];
max_state_duration = [max_S1, max_systole, max_S2, max_diastole];

duration_probs = zeros(N,3*Fs);
duration_sum = zeros(N,1);
for state_j = 1:N
    
    duration_mean = cell2mat(d_distributions(state_j,1));
    duration_variance = cell2mat(d_distributions(state_j,2));
    
    for d = 1:max_duration_D
        if(d < min_state_duration(state_j) || d > max_state_duration(state_j))
            % Durations outside the allowed range get the smallest
            % positive value, so that their logarithm stays finite:
            duration_probs(state_j,d) = realmin;
        else
            duration_probs(state_j,d) = mvnpdf(d,duration_mean,duration_variance);
        end
    end
    duration_sum(state_j) = sum(duration_probs(state_j,:));
end


if(length(duration_probs)>3*Fs)
    duration_probs(:,(3*Fs+1):end) = [];
end

if(figures)
    figure('Name', 'Duration probabilities');
    plot(duration_probs(1,:)./ duration_sum(1),'Linewidth',2);
    hold on;
    plot(duration_probs(2,:)./ duration_sum(2),'r','Linewidth',2);
    hold on;
    plot(duration_probs(3,:)./ duration_sum(3),'g','Linewidth',2);
    hold on;
    plot(duration_probs(4,:)./ duration_sum(4),'k','Linewidth',2);
    hold on;
    legend('S1 Duration','Systolic Duration','S2 Duration','Diastolic Duration');
    pause();
end
%% Perform the actual Viterbi Recursion:


qt = zeros(1,length(delta));
%% Initialisation Step

%Equation 32a and 69a, but leave out the probability of being in
%state i for only 1 sample, as the state could have started before time t =
%0.

delta(1,:) = log(pi_vector) + log(observation_probs(1,:)); %first value is the probability of initially being in each state * probability of observation 1 coming from each state

%Equation 32b
psi(1,:) = -1;


% The state duration probabilities are now used.
%Change the a_matrix to have zeros along the diagonal, therefore, only
%relying on the duration probabilities and observation probabilities to
%influence change in states:
%This would only be valid in sequences where the transition between states
%follows a distinct order.
a_matrix = [0,1,0,0;0 0 1 0; 0 0 0 1;1 0 0 0];


%% Run the core Viterbi algorithm

if(springer_options.use_mex)
    
    %% Run Mex code
    % Ensure you have run the mex viterbi_PhysChallenge.c code on the
    % native machine before running this:
    [delta, psi, psi_duration] = viterbi_Springer(N,T,a_matrix,max_duration_D,delta,observation_probs,duration_probs,psi, duration_sum);
    
    
else
    
    %% Recursion
    
    %% The Extended Viterbi algorithm:
    
    %Equations 33a and 33b and 69a, b, c etc:
    %again, omitting the p(d), as state could have started before t = 1
    
    % This implementation extends the standard implementation of the
    % duration-dependent Viterbi algorithm by allowing the durations to
    % extend beyond the start and end of the time series, thereby allowing
    % states to "start" and "stop" outside of the recorded signal. This
    % addresses the issue of partial states at the beginning and end of the
    % signal being labelled as the incorrect state. For instance, a
    % short-duration diastole at the beginning of a signal looks a lot like
    % systole, and can lead to labelling errors.
    
    % The log transition matrix does not change during the recursion, so
    % it is computed once (forbidden transitions become -inf):
    log_a_matrix = log(a_matrix);
    
    % t spans 2 to T + max_duration_D - 1:
    for t = 2:T+ max_duration_D-1
        
        %The end of the analysis window, which is the current time
        %step, unless the time has gone past T, the end of the record, in
        %which case it is truncated to T. This allows the analysis
        %window to extend past the end of the record, so that the
        %timing durations of the states do not have to "end" at the end
        %of the record. It only depends on t, so it is set once per t:
        end_t = t;
        if(t>T)
            end_t = T;
        end
        
        for j = 1:N
            for d = 1:1:max_duration_D
                
                
                %The start of the analysis window, which is the current time
                %step minus d (the time horizon we are currently looking
                %back). The analysis window can be seen to be starting one
                %step back each time the variable d is increased.
                % This is clamped to 1 if extending past the start of the
                % record, and T-1 if extending past the end of the record:
                start_t = t - d;
                if(start_t<1)
                    start_t = 1;
                end
                if(start_t > T-1)
                    start_t = T-1;
                end
                
                
                %Find the max_delta and index of that from the previous step
                %and the transition to the current step:
                %This is the first half of the expression of equation 33a from
                %Rabiner:
                [max_delta, max_index] = max(delta(start_t,:)+log_a_matrix(:,j)');
                
                
                %Find the probability of seeing all the observations in
                %the analysis window in state j. The product can underflow
                %to zero for long windows, in which case it is replaced by
                %the smallest positive value so the logarithm is finite:
                probs = prod(observation_probs(start_t:end_t,j));
                
                if(probs ==0)
                    probs = realmin;
                end
                emission_probs = log(probs);
                
                
                %Guard retained from the original implementation: a log
                %emission probability of exactly zero or NaN is replaced
                %with realmin:
                if(emission_probs == 0 || isnan(emission_probs))
                    emission_probs =realmin;
                end
                
                
                %Find the total probability of transitioning from the last
                %state to this one, with the observations and being in the same
                %state for the analysis window. This is the duration-dependent
                %variation of equation 33a from Rabiner:
                delta_temp = max_delta + (emission_probs)+ log((duration_probs(j,d)./duration_sum(j)));
                
                
                %Unlike equation 33a from Rabiner, the maximum delta could come
                %from multiple d values, or from multiple size of the analysis
                %window. Therefore, only keep the maximum delta value over the
                %entire analysis window:
                %If this probability is greater than the last greatest,
                %update the delta matrix and the time duration variable:
                if(delta_temp>delta(t,j))
                    delta(t,j) = delta_temp;
                    psi(t,j) = max_index;
                    psi_duration(t,j) = d;
                end
                
            end
        end
    end
end


%% Termination

% For the extended case, need to find max prob after end of actual
% sequence:

% Find just the delta after the end of the actual signal
temp_delta = delta(T+1:end,:);
%Find the maximum value in this section, and which state it is in:
[~, idx] = max(temp_delta(:));
[pos, ~] = ind2sub(size(temp_delta), idx);

% Change this position to the real position in delta matrix:
pos = pos+T;

%1) Find the last most probable state
%2) From the psi matrix, find the most likely preceding state
%3) Find the duration of the last state from the psi_duration matrix
%4) From the onset to the offset of this state, set to the most likely state
%5) Repeat steps 2 - 5 until reached the beginning of the signal


%The initial steps 1-4 are equation 34b in Rabiner. 1) finds P*, the most
%likely last state in the sequence, 2) finds the state that precedes the
%last most likely state, 3) finds the onset in time of the last state
%(included due to the duration-dependency) and 4) sets the most likely last
%state to the q_t variable.

%1)
[~, state] = max(delta(pos,:),[],2);

%2)
offset = pos;
preceding_state = psi(offset,state);

%3)
onset = offset - psi_duration(offset,state)+1;

% For short records the final state can reach back past the start of the
% signal; clamp the onset to the first sample, as is done inside the
% back-tracking loop below, to avoid a non-positive index:
if(onset<1)
    onset = 1;
end

%4)
qt(onset:offset) = state;

%The state is then updated to the preceding state, found above, which must
%end when the last most likely state started in the observation sequence:
state = preceding_state;

count = 0;
%Back-track while the most recently labelled state starts after sample 2:
while(onset > 2)
    
    %2)
    offset = onset-1;
    preceding_state = psi(offset,state);
    
    %3)
    onset = offset - psi_duration(offset,state)+1;
    
    %4)
    if(onset<2)
        onset = 1;
    end
    qt(onset:offset) = state;
    state = preceding_state;
    count = count +1;
    
    % Safety limit on the number of back-tracking steps:
    if(count> 1000)
        break;
    end
end

% If back-tracking stopped with a state whose onset is exactly sample 2,
% the loop above never labels sample 1. That sample belongs to the pending
% preceding state (the state that ended at t = 1), so assign it here rather
% than leaving qt(1) at its initial value of 0:
if(onset == 2 && state >= 1 && state <= N)
    qt(1) = state;
end

qt = qt(1:T);