function [ raw, raw_header ] = pnSubsampleData(data, minutes, type, desiredFeats)
%PNSUBSAMPLEDATA Subsample data by taking the worst value in X minutes
%   [ raw, raw_header ] = pnSubsampleData(data, minutes) creates a numeric
%   matrix with columns representing features and rows representing
%   observations, plus a matching header. Each temporal feature is
%   generated by subsampling the data in 'data' using the worst (lowest or
%   highest) value every 'minutes' minutes.
%
%
%	Inputs:
%		data        - Structure with fields for each observation. Each
%           field then contains a cell array of the data in standard
%           format. A cell array is also accepted: one row per
%           observation, first column an identifier, and the following
%           three columns holding the time stamps, the variable names and
%           the values of that observation.
%       minutes     - Time periods used to segment the data into (e.g.,
%           24*60 for one day)
%       type        - Whether to pick the lowest or highest value in each
%           time period
%           'low' (default) - Extract the lowest value in each time period
%           'high' - Extract the highest value in each time period
%           (any value other than 'low' is treated as 'high')
%       desiredFeats - Optional cell array of strings specifying that only
%           a subset of features should be extracted, e.g., specify
%           {'Urine','pH'} to only extract subsampled data for
%           urine and pH. A single string is treated as a one-element
%           cell array.
%
%	Outputs:
%		raw         - Double matrix which is Nx(F+P*D) (N observations, F
%           fixed features, D temporal features extracted, P time
%           periods). Each temporal column contains a single value for
%           each observation which was the lowest or highest for the given
%           time period ('Urine' is summed over the period instead).
%           Temporal values with no measurement in the period are NaN.
%           Fixed features which are not found keep the pre-allocated
%           value 0.
%       raw_header  - Cell array of strings indicating what the columns
%           are in raw. Temporal names have a number appended indicating
%           the first time value for that period (e.g., pH_60 contains pH
%           values after 60 minutes). If followed later in the header by
%           pH_120, then pH_60 can be understood as the lowest or highest
%           pH value between 60 and 120 minutes for each patient.
%
%
%	Example
%       %=== Load data in
%       load('data_processed_cell.mat');
%
%       %=== Choose features to extract from
%       header_extract = {'Urine','Platelets','BUN','Creatinine','PaO2'};
%
%       %=== Extract highest values over 48 hours
%       [high,header_H] = pnSubsampleData(data, 60*48,'high',header_extract);
%
%	See also PNLOADTEXTFILESCELL PNEXTRACTFIELD PNMAIN

%	References:
%       Physionet Challenge 2012
%       http://physionet.org/challenge/2012/

%	Copyright 2012 Alistair Johnson

%	$LastChangedBy: alistair $
%	$LastChangedDate: 2012-04-24 22:39:24 +0100 (Tue, 24 Apr 2012) $
%	$Revision: 342 $
%	Originally written on GLNXA64 by Alistair Johnson, 25-Jan-2012 18:48:19
%	Contact: alistairewj@gmail.com

if nargin<4
    desiredFeats = [];
end
if nargin<3
    type = 'low';
end

%=== Accept a single feature name given as a plain string
if ischar(desiredFeats)
    desiredFeats = {desiredFeats};
end

%=== Number of observations
if isstruct(data)
    Nfn = numel(fieldnames(data));
else
    Nfn = size(data,1);
end

maxTime = 48*60; % Maximum time for data is 48 hours in the ICU
nSamples = 0:minutes:maxTime; % Start time of each period
if nSamples(end)==maxTime
    % Do not include last value: no period starts at the maximum time
    nSamples = nSamples(1:end-1);
end

%=== Convert period start times to a cell array of string suffixes
sampStrings = arrayfun(@(x) ['_' num2str(x)], nSamples,'UniformOutput',false);

%=== The 5 variables which do not change temporally
fixedVars = {'RecordID','Age','Gender','Height','Weight'};

%=== Variables for which a feature is created at each time period
% Some names also appear in fixedVars. When such a variable is extracted as
% a fixed variable its entries are removed from the data first (only the
% first entry in the case of 'Weight').
tempVars = {'ALP';'ALT';'AST';'Albumin';'BUN';'Bilirubin';'Cholesterol'; ...
    'Creatinine';'DiasABP';'FiO2';'GCS';'Gender';'Glucose';'HCO3';'HCT'; ...
    'HR';'Height';'K';'Lactate';'MAP';'MechVent';'Mg';'NIDiasABP';'NIMAP'; ...
    'NISysABP';'Na';'PaCO2';'PaO2';'Platelets';'RecordID';'RespRate'; ...
    'SaO2';'SysABP';'Temp';'TroponinI';'TroponinT';'Urine';'WBC'; ...
    'Weight';'pH'};

%=== If user specified certain features, remove the others
if ~isempty(desiredFeats)
    fixedVars = fixedVars(ismember(fixedVars, desiredFeats));
    tempVars = tempVars(ismember(tempVars, desiredFeats));
end

%=== Build the temporal header. The reshape is column-major, so all
% variables of the first period come first, then all of the second, etc.
temporalVars = strcat(repmat(tempVars,1,size(sampStrings,2)),...
    repmat(sampStrings,size(tempVars,1),1));
temporalVars = reshape(temporalVars,1,size(temporalVars,1)*size(temporalVars,2));
nVar = length(tempVars);

%=== Create a cell array (pointer) for all patients
% Below, column 1 is used as time stamps, column 2 as variable names and
% column 3 as values.
if isstruct(data)
    % NOTE(review): this assumes struct2cell yields those three columns
    % for the stored per-observation format - verify against the loader.
    patData = structfun(@(x) x(1:end,:),data,'UniformOutput',false);
    patData = struct2cell(patData);
else
    patData = data(:,2:end);
end

%=== Pre-allocate
raw = zeros(Nfn,length(fixedVars)+length(temporalVars));
raw_header = [fixedVars,temporalVars];

%=== First extract the "fixed variables"
for v=1:length(fixedVars)
    %=== Logical index of every entry of this variable, per observation
    idxRemove = cellfun(@(x) strcmp(x,fixedVars{v}), patData(:,2),'UniformOutput',false);

    %=== Observations in which the variable was found at least once
    idxFound = cellfun(@(x) any(x(:)), idxRemove);
    idxUse = cell(size(idxFound));

    %=== Extract a single index to use for data
    % Only one data value will be used, but multiple entries may be
    % removed. For example, while there may be 3 entries for "Age", only
    % the final "Age" value is extracted and all 3 are removed.
    if strcmp(fixedVars{v},'Weight')
        % Weight: use the first entry and remove only that entry, so the
        % later weight measurements stay available as temporal data
        idxUse(idxFound) = cellfun(@(x) find(x,1,'first'), idxRemove(idxFound),'UniformOutput',false);
        idxRemove = cellfun(@removeFirstWeight, idxRemove, idxUse,'UniformOutput',false);
    else
        idxUse(idxFound) = cellfun(@(x) find(x,1,'last'), idxRemove(idxFound),'UniformOutput',false);
    end

    %=== Add data to raw (observations without the variable keep 0)
    if any(idxFound)
        tmp = cellfun(@(x,y) x(y), patData(:,3), idxUse,'UniformOutput',false);
        raw(idxFound,v) = cell2mat(tmp(idxFound));
    end

    %=== Remove data from patData
    idxKeep = cellfun(@(x) ~x, idxRemove,'UniformOutput',false);
    for k=1:size(patData,2)
        patData(:,k) = cellfun(@(x,y) x(y), patData(:,k), idxKeep,'UniformOutput',false);
    end
end

%=== Cycle through each temporal split, create an index
% Period t covers nSamples(t) <= time < nSamples(t+1). The last period is
% open ended, so measurements at or after its start are all included.
% NOTE(review): the comparison operators were reconstructed from the
% header naming convention (suffix = first time value of the period) -
% confirm the boundary inclusivity against the reference implementation.
idxSamples = cell(Nfn,length(nSamples));
for t=1:(length(nSamples)-1)
    idxSamples(:,t) = cellfun(@(x) x>=nSamples(t) & x<nSamples(t+1), patData(:,1),'UniformOutput',false);
end
idxSamples(:,end) = cellfun(@(x) x>=nSamples(end), patData(:,1),'UniformOutput',false);

%=== Hoisted out of the loops: anything other than 'low' selects the maximum
useLowest = strcmp(type,'low');

%=== Calculate for each different split (e.g. every 24*60 minutes..)
for t=1:length(nSamples)
    %=== For each time window, extract the relevant data
    currData = cell(size(patData,1),2);
    currData(:,1) = cellfun(@(x,y) x(y), patData(:,2), idxSamples(:,t),'UniformOutput',false); % feature names
    currData(:,2) = cellfun(@(x,y) x(y), patData(:,3), idxSamples(:,t),'UniformOutput',false); % feature data

    %=== Then scan for each individual variable
    for v=1:length(tempVars)
        %=== Extract indices of the variable desired
        varDataTemp = cellfun(@(x) strcmp(x,tempVars{v}), currData(:,1),'UniformOutput',false);

        %=== Extract the value of each occurrence of that variable in the window
        varDataTemp = cellfun(@(x,y) x(y)', currData(:,2), varDataTemp,'UniformOutput',false);

        %=== Replace each empty cell with NaN, the missing flag
        idxEmpty = cellfun(@(x) isempty(x), varDataTemp);
        varDataTemp(idxEmpty) = {NaN};

        %=== Calculate column index for raw (matches the header ordering)
        idxData = (t-1)*nVar+v+numel(fixedVars);

        %=== *** SPECIAL CASE ***
        if strcmp(tempVars{v},'Urine')
            %=== Urine values should be summed
            varDataTemp = cellfun(@(x) sum(x,2),varDataTemp,'UniformOutput',false);
        end

        %=== Take the minimum (or maximum) value as data to add to raw
        if useLowest
            raw(:,idxData) = cellfun(@(x) min(x,[],2), varDataTemp);
        else
            raw(:,idxData) = cellfun(@(x) max(x,[],2), varDataTemp);
        end
    end
end

end

function [x] = removeFirstWeight(x,y)
%REMOVEFIRSTWEIGHT Logical mask, sized like x, that is true only at index y
%   With an empty y the mask is all false, so nothing is marked for removal.
x=false(size(x));
x(y)=true;
end