function [ data_processed ] = pnPreprocess(data, verboseFlag)
%PNPREPROCESS Output data after preprocessing
%   [ data_processed ] = pnPreprocess(data) applies field-specific cleaning
%   rules (unit fixes, sentinel replacement, deletion of implausible
%   values) to every field listed by pnDataDescriptions and returns the
%   cleaned copy of data.
%
%   [ data_processed ] = pnPreprocess(data, verboseFlag) additionally
%   prints a summary of each step to the command window when verboseFlag
%   is true (default: false).
%
%   Primary variables:
%       data        - Cell array data, rows = # of patients
%       dataDesc    - Cell array of data fields/descriptions
%       dataFixed   - Cell array of demographic data fields/descriptions
%       tmp         - Data for a single parameter, e.g. 'HR'
%                     (column 4 holds the values that are manipulated)
%       idx         - Per-patient indices mapping tmp back into data
%       idxRem      - Index for data to be removed at end of the loop
%       idxManip    - Index for data to be manipulated in the switch block
%       high/low    - Extracted data which is manipulated and re-imputed
%                     into tmp
%
%   See also PNDATADESCRIPTIONS, PNEXTRACTFIELD

%   Copyright 2012 Alistair Johnson

%   $LastChangedBy: alistair $
%   $LastChangedDate: 2012-05-29 09:01:11 -0400 (Tue, 29 May 2012) $
%   $Revision: 1 $
%   Originally written on PCWIN64 by Alistair Johnson, 25-Apr-2012 01:37:49
%   Contact: alistairewj@gmail.com

if nargin<2
    verboseFlag = false;
end

data_processed = data;
[dataDesc,dataFixed] = pnDataDescriptions();

%=== Loop through the fixed, demographic fields
for k=1:size(dataFixed,1)
    fn = dataFixed{k,1};
    if verboseFlag; fprintf('\n%%=== %s ===%%\n', fn); end;
    [tmp,idx] = pnExtractField(data_processed,fn);

    %=== Reset delete indices
    idxRem = [];

    switch fn
        case 'RecordID'
            continue;

        case 'Age'
            % NOTE(review): threshold is >100 but the message below talks
            % about values of 200 - confirm the intended cut-off.
            idxManip = cellfun(@(x) x>100, tmp(:,4),'UniformOutput',false);
            high = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            high = cellfun(@(x) x*0+105, high, 'UniformOutput',false);
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = high{m};
            end
            if verboseFlag; fprintf('Replaced %2.0f values of 200 with 105.\n',sum(cell2mat(idxManip))); end;

            %=== Analyze residuals of data, check for bias
            % idxManip = cellfun(@(x) numel(x), tmp(:,4),'UniformOutput',false);
            % tmpDataAnalyze = cellfun(@(x,y) x(1:y-1) - x(y), tmp(:,4), idxManip, 'UniformOutput',false);
            % tmpDataAnalyze = cell2mat(tmpDataAnalyze(cellfun(@(x) ~isempty(x), tmpDataAnalyze)));
            % hist(tmpDataAnalyze,-10:1:10); xlabel('Age (1:end-1) - Age(end)');

            % Mark every value except the last one of each patient
            idxManip = cellfun(@(x) [true(numel(x)-1,1);false], tmp(:,4),'UniformOutput',false);

            %=== Impute 0s for vector values
            tmp(:,4) = cellfun(@(x,y) x-x.*y, tmp(:,4), idxManip, 'UniformOutput',false);
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 0, 'eq'); % index ==0 for removal
            if verboseFlag; fprintf('Deleted %2.0f vector values, leaving only the end value.\n',N); end;

        case 'Gender'
            idxManip = cellfun(@(x) (x==-1), tmp(:,4),'UniformOutput',false);
            high = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            high = cellfun(@(x) NaN, high, 'UniformOutput',false);
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = high{m};
            end
            if verboseFlag; fprintf('Replaced %2.0f values which were -1 with NaN.\n',sum(cell2mat(idxManip))); end;

        case 'Height'
            % 1 centimetre = 0.393700787 inches
            % 1 foot = 30.48 centimetres
            % 1 inch = 2.54 centimetres
            idxManip = cellfun(@(x) x==-1, tmp(:,4),'UniformOutput',false);
            low = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            low = cellfun(@(x) NaN, low, 'UniformOutput',false);
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = low{m};
            end
            if verboseFlag; fprintf('Replaced %2.0f values which were -1 with NaN.\n',sum(cell2mat(idxManip))); end;

            idxManip = cellfun(@(x) x<10, tmp(:,4),'UniformOutput',false);
            low = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            low = cellfun(@(x) x*100, low, 'UniformOutput',false);
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = low{m};
            end
            if verboseFlag; fprintf('Multipled %2.0f values of <10 by 100 (1.8->180).\n',sum(cell2mat(idxManip))); end;

            idxManip = cellfun(@(x) x<25, tmp(:,4),'UniformOutput',false);
            low = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            low = cellfun(@(x) x*10, low, 'UniformOutput',false);
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = low{m};
            end
            if verboseFlag; fprintf('Multipled %2.0f values of <25 by 10 (18->180).\n',sum(cell2mat(idxManip))); end;

            % Values below 100 are treated as inches and converted to cm
            idxManip = cellfun(@(x) x<100, tmp(:,4),'UniformOutput',false);
            low = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            low = cellfun(@(x) x*2.54, low, 'UniformOutput',false);
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = low{m};
            end
            % Message now reports the factor that is actually applied (2.54)
            if verboseFlag; fprintf('Multipled %2.0f values of <100 by 2.54 (70.9->180).\n',sum(cell2mat(idxManip))); end;

            idxManip = cellfun(@(x) x>1000, tmp(:,4),'UniformOutput',false);
            high = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            high = cellfun(@(x) x*0.1, high, 'UniformOutput',false);
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = high{m};
            end
            if verboseFlag; fprintf('Multipled %2.0f values of >1000 by 0.1 (1800->180).\n',sum(cell2mat(idxManip))); end;

            idxManip = cellfun(@(x) x>250, tmp(:,4),'UniformOutput',false);
            high = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            high = cellfun(@(x) x/2.54, high, 'UniformOutput',false);
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = high{m};
            end
            if verboseFlag; fprintf('Multipled %2.0f values of >250 by 0.3937 (400->157).\n',sum(cell2mat(idxManip))); end;
    end

    %=== Replace data with tmp
    for m=2:size(data,2)
        data_processed(:,m) = cellfun(@pnPreprocessReplaceData, data_processed(:,m), tmp(:,m), idx, 'UniformOutput', false);
    end

    %=== Delete entries flagged for deletion
    if ~isempty(idxRem)
        for m=2:size(data,2)
            data_processed(:,m) = cellfun(@pnPreprocessDeleteData, data_processed(:,m), idxRem, 'UniformOutput', false);
        end
    end
end

%=== Loop through all the fields, and process each appropriately
for k=1:size(dataDesc,1)
    fn = dataDesc{k,1};
    % The verbose banner is closed here so that the processing below runs
    % regardless of verboseFlag (previously the whole loop body was nested
    % inside this "if" and was skipped when verboseFlag was false).
    if verboseFlag; fprintf('\n%%=== %s ===%%\n', fn); end;
    [tmp,idx] = pnExtractField(data_processed,fn);
    idxRem=[];

    switch fn
        case 'Albumin'
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;

        case 'ALP'
            %=== Possible conversion errors in ALP, ALT, AST:
            %   Listed as measured in IU
            %   1 Katal = 60,000,000 IU
            %   1 micro Katal = 60 IU
            %   1 nano Katal = 0.060 IU
            %   *** micro kats are sometimes used.
            % Reference range: 30 to 120 IU/L
            if verboseFlag; fprintf('No preprocessing performed - Note: highly skewed. Dependent on age and gender.\n'); end;

        case 'ALT'
            % Reference range: 7 to 56 IU/L
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;

        case 'AST'
            % Reference range: 5 to 40 IU/L
            % Very heavy tailed: Normal range 5 to 40 IU/L, 10%% data > 1000 IU/L.
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;

        case 'Bilirubin'
            % Reference range: 0.2 to 1.2 mg/dL
            % 1 mg/dL == 17.1 µmol/L
            % It is possible to have >36 mg/dL in cases of transplant, etc.
            % Can't unilaterally fix this.
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;

        case 'BUN'
            % Reference range: 10-20 mg/dl (3.6-71 mmol/liter)
            % 1 mg/dL == 0.357 mmol/L
            % Can't really convert anything here.
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 0, 'eq');
            if verboseFlag; fprintf('Deleted %2.0f values which were 0.\n',N); end;
            % for m=2:size(tmp,2)
            %     tmp(:,m) = cellfun(@(x,y) x(y), tmp(:,m), idxManip,'UniformOutput',false);
            % end

        case 'Cholesterol'
            % Reference range:
            %   Desirable       <200 mg/dl      <5.17 mmol/L
            %   Borderline high 200-239 mg/dl   5.17-6.18 mmol/L
            %   High            ≥240 mg/dl      ≥6.18 mmol/L
            % 1 mg/dL == 0.0259 mmol/L
            % Nothing bad here.
            if verboseFlag; fprintf('No preprocessing needed.\n'); end;

        case 'Creatinine'
            % Reference range: 0.6-1.3 mg/dl
            %   <1.5 mg/dl (NEJM)
            % 1 mg/dL == 88.4 µmol/L
            %=== ~6 is a reasonable maximum, 7.6 is very confident
            % Seems OK.
            if verboseFlag; fprintf('No preprocessing needed.\n'); end;

        case 'DiasABP'
            %=== First, delete '0's since we don't know if it was badly
            %   converted or missing
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 0, 'eq');
            if verboseFlag; fprintf('Deleted %2.0f values which were 0.\n',N); end;
            [idxRem2,N] = pnPreprocessRemovalIndices(tmp, idx, -1, 'eq');
            if verboseFlag; fprintf('Deleted %2.0f values which were -1.\n',N); end;
            idxRem = cellfun(@(x,y) x | y, idxRem, idxRem2, 'UniformOutput',false);

            % Count of values >170 minus count of values >=200 gives the
            % number of values in (170,200); these are only reported.
            idxManip = cellfun(@(x) x>170, tmp(:,4),'UniformOutput',false);
            idxManip2 = cellfun(@(x) x<200, tmp(:,4),'UniformOutput',false);
            idxManip = cellfun(@(x) sum(x), idxManip);
            idxManip2 = cellfun(@(x) sum(~x), idxManip2);
            if verboseFlag; fprintf('There exist %2.0f values between 170-200, which were left as is.\n',sum(idxManip)-sum(idxManip2)); end;

            [idxRem2, N] = pnPreprocessRemovalIndices(tmp, idx, 200, 'gt');
            [idxRem] = cellfun(@(x,y) x | y, idxRem, idxRem2, 'UniformOutput',false);
            if verboseFlag; fprintf('Deleted %2.0f values above 200.\n',N); end;

        case 'FiO2'
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;

        case 'GCS'
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;

        case 'Glucose'
            % Reference ranges:
            %   Fasting
            %   Normal              75-115 mg/dl    4.2-6.4 mmol/L
            %   Diabetes mellitus   >125 mg/dl      >7.0 mmol/L
            %   2 Hr post-meal      <120 mg/dl      <6.7 mmol/L
            % 1 mg/dL == 0.0555 mmol/L
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;

        case 'HCO3'
            % Reference ranges: 21-30 mEq/L     21-28 mmol/L
            % 1 mEq/L == 1 mmol/L
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;

        case 'HCT'
            % Reference range: 41.0-53.0%
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;

        case 'HR'
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 1, 'lt');
            if verboseFlag; fprintf('Deleted %2.0f values which were less than 1.\n',N); end;
            [idxRem2,N] = pnPreprocessRemovalIndices(tmp, idx, 299, 'gt');
            idxRem = cellfun(@(x,y) x | y, idxRem, idxRem2, 'UniformOutput',false); % Combine removal indices
            if verboseFlag; fprintf('Deleted %2.0f values which above 299.\n',N); end;

        case 'K'
            % Reference ranges: 3.5-5.0 mEq/L   3.5-5.0 mmol/L
            % 1 mEq/L == 1 mmol/L
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;

        case 'Lactate' % mmol/L
            % Reference ranges: 5-15 mg/dl      0.6-1.7 mmol/liter
            % 1 mg/dL == 0.111 mmol/L
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;

        case 'Mg' % mmol/L
            % Reference ranges: 1.8-3.0 mg/dl   0.8-1.2 mmol/L
            % 1 mg/dL == 0.411 mmol/L
            % 1 mEq/L == 0.50 mmol/L
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;

        case 'MAP' % mmHg
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 1, 'lt');
            if verboseFlag; fprintf('Deleted %2.0f values which were less than 1.\n',N); end;

        case 'MechVent'
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;

        case 'Na' % mEq/L
            % Reference ranges: 136-145 mEq/L   136-145 mmol/L
            % 1 mEq/L == 1 mmol/L
            %=== Interesting spike at 150, possible rounding bias?
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;

        case 'NIDiasABP' % mmHg
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 1, 'lt');
            if verboseFlag; fprintf('Deleted %2.0f values which were less than 1.\n',N); end;

        case 'NIMAP' % mmHg
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 1, 'lt');
            if verboseFlag; fprintf('Deleted %2.0f values which were less than 1.\n',N); end;

        case 'NISysABP' % mmHg
            % Values below 1??
            % Uses 'lt' (not 'eq') so the removal matches the message and
            % the other pressure fields.
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 1, 'lt');
            if verboseFlag; fprintf('Deleted %2.0f values which less than 1.\n',N); end;

        case 'PaCO2' % mmHg
            % Reference range:
            %   4.7-6.0 kPa
            %   35-45 mmHg
            % 1 kPa == 7.5006 mmHg
            idxManip = cellfun(@(x) x<10 & x>1, tmp(:,4),'UniformOutput',false);
            low = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            low = cellfun(@(x) x*10, low, 'UniformOutput',false);
            if verboseFlag; pnPreprocessPrintReplacedValues(tmp(:,4),low,idxManip); end;
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = low{m};
            end
            if verboseFlag; fprintf('Replaced %2.0f values between [1,10] which were incorrectly recorded.\n',sum(cell2mat(idxManip))); end;

            % Values below 1 must be wrong...
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 1, 'lt');
            if verboseFlag; fprintf('Deleted %2.0f values which were less than 1.\n',N); end;
            % idxManip = cellfun(@(x) x<1, tmp(:,4),'UniformOutput',false);
            % low = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            % low = cellfun(@(x) x*100, low, 'UniformOutput',false);
            %
            % for m=1:size(tmp,1)
            %     tmp{m,4}(idxManip{m}) = low{m};
            % end

        case 'PaO2' % mmHg
            % Reference range:
            %   11-13 kPa
            %   75-100 mmHg
            % 1 kPa == 7.5006 mmHg
            idxManip = cellfun(@(x) x<20 & x>1, tmp(:,4),'UniformOutput',false);
            low = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            low = cellfun(@(x) x*10, low, 'UniformOutput',false);
            if verboseFlag; pnPreprocessPrintReplacedValues(tmp(:,4),low,idxManip); end;
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = low{m};
            end
            if verboseFlag; fprintf('Replaced %2.0f values between [1,20] which were incorrectly recorded.\n',sum(cell2mat(idxManip))); end;

            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 1, 'lt');
            if verboseFlag; fprintf('Deleted %2.0f values which were less than 1.\n',N); end;

        case 'pH'
            % Reference range:
            %   7.34-7.45 units
            %=== This is a mess
            % There are 5 values between 94-100, not sure why. Probably put
            % in the wrong field? Doesn't match with an [H+] conversion

            % Decimal point shifted one place left: scale by 10 (the
            % previous factor of 7.5006 is the kPa->mmHg constant and moved
            % these values into the range deleted below).
            idxManip = cellfun(@(x) (x>0.65 & x<0.8), tmp(:,4),'UniformOutput',false);
            low = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            low = cellfun(@(x) x*10, low, 'UniformOutput',false);
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = low{m};
            end
            if verboseFlag; fprintf('Multiplied %2.0f values by 10 (0.7->7).\n',sum(cell2mat(idxManip))); end;

            idxManip = cellfun(@(x) (x>65 & x<80), tmp(:,4),'UniformOutput',false);
            low = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            low = cellfun(@(x) x*0.1, low, 'UniformOutput',false);
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = low{m};
            end
            if verboseFlag; fprintf('Multiplied %2.0f values by 0.1 (70->7).\n',sum(cell2mat(idxManip))); end;

            idxManip = cellfun(@(x) (x>650 & x<800), tmp(:,4),'UniformOutput',false);
            low = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            low = cellfun(@(x) x*0.01, low, 'UniformOutput',false);
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = low{m};
            end
            if verboseFlag; fprintf('Multiplied %2.0f values by 0.01 (700->7).\n',sum(cell2mat(idxManip))); end;

            %=== Values <6.5 and >0.8 are deleted
            % Note: (there are no values around 0.6-0.75)
            [idxRem] = pnPreprocessRemovalIndices(tmp, idx, 0.8, 'gt');
            [idxRem2] = pnPreprocessRemovalIndices(tmp, idx, 6.5, 'lt');
            idxRem = cellfun(@(x,y) x & y, idxRem, idxRem2, 'UniformOutput',false);
            N = sum(cell2mat(cellfun(@(x) sum(x), idxRem, 'UniformOutput', false)));
            if verboseFlag; fprintf('Deleted %2.0f values which were between [0.8,6.5].\n',N); end;

            %=== Values >80 & <650 are deleted
            % Note: (there are no values around 0.6-0.75)
            [idxRem3] = pnPreprocessRemovalIndices(tmp, idx, 80, 'gt');
            [idxRem4] = pnPreprocessRemovalIndices(tmp, idx, 650, 'lt');
            idxRem = cellfun(@(x,y,z) (x & y) | z, idxRem3, idxRem4, idxRem, 'UniformOutput',false);
            N = sum(cell2mat(cellfun(@(x,y) sum(x&y), idxRem3, idxRem4, 'UniformOutput', false)));
            if verboseFlag; fprintf('Deleted %2.0f values which were between [80,650].\n',N); end;

        case 'Platelets' % cells/nL
            % Reference range: 150-350 (10e3)/mm^3
            % 1 (10e3)/µL == 1 (10e3)/mm^3 == 1/nL
            % Errors are probably in orders of 1000
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;

        case 'RespRate' % bpm
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 1, 'lt');
            if verboseFlag; fprintf('Deleted %2.0f values which were less than 1.\n',N); end;

        case 'SaO2' % %
            % Reference range: 94-100
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;

        case 'SysABP' % mmHg
            % Reference range:
            %   10-14 kPa
            %   75-105 mmHg
            % 1 kPa == 7.5006 mmHg
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 1, 'lt');
            if verboseFlag; fprintf('Deleted %2.0f values which were less than 1.\n',N); end;

        case 'Temp'
            %=== Check for Fahrenheit measurements and incorrect conversions
            %=== There are 129 values which are too low...
            %=== Assume some of the values are artefacts

            %=== INCORRECTLY CONVERTED F->C
            % 1x range should be ~ [1.5,8], use (1,10)
            idxManip = cellfun(@(x) x<10 & x>1, tmp(:,4),'UniformOutput',false);
            low = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            low = cellfun(@(x) x*9/5+32, low, 'UniformOutput',false);
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = low{m};
            end

            % 2x range should be ~ [-16.8,-13], use (-17,-13)
            % The mask is built from the full value vectors in tmp(:,4) so
            % that it has the same size as the data it indexes.
            idxManip2 = cellfun(@(x) x<=-13 & x>-17, tmp(:,4), 'UniformOutput',false);
            low = cellfun(@(x,y) x(y), tmp(:,4),idxManip2,'UniformOutput',false);
            low = cellfun(@(x) (x*9/5+32)*9/5+32, low, 'UniformOutput',false);
            for m=1:size(tmp,1)
                % Write back into the value column (4), not tmp{m} (column 1)
                tmp{m,4}(idxManip2{m}) = low{m};
            end

            %=== RECORDED AS F
            % 1x range should be ~ [95,113]
            idxManip3 = cellfun(@(x) x>90 & x<120, tmp(:,4),'UniformOutput',false);
            low = cellfun(@(x,y) x(y), tmp(:,4),idxManip3,'UniformOutput',false);
            low = cellfun(@(x) (x-32)*5/9, low, 'UniformOutput',false);
            for m=1:size(tmp,1)
                % Write back into the value column (4), not tmp{m} (column 1)
                tmp{m,4}(idxManip3{m}) = low{m};
            end

            %=== DELETING VALUES NOW ===%
            %=== Delete 0s and values below -17s (0s and converted 0s)
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 0, 'eq');
            if verboseFlag; fprintf('Deleted %2.0f values which were equal to 0.\n',N); end;
            [idxRem2,N] = pnPreprocessRemovalIndices(tmp, idx, -17, 'lt');
            if verboseFlag; fprintf('%2.0f of those values were -17.8 (i.e. incorrectly converted from 0).\n',N); end;

            %=== Negative values which don't seem erroneously converted
            [idxRem3] = pnPreprocessRemovalIndices(tmp, idx, -13, 'ge');
            [idxRem4] = pnPreprocessRemovalIndices(tmp, idx, 1, 'le');
            idxRem3 = cellfun(@(x,y) x & y, idxRem3, idxRem4, 'UniformOutput',false);
            N = sum(cell2mat(cellfun(@(x) sum(x), idxRem3, 'UniformOutput', false)));
            if verboseFlag; fprintf('Deleted %2.0f values which were between [-13,1].\n',N); end;

            %=== Positive values which don't seem erroneously converted
            [idxRem4] = pnPreprocessRemovalIndices(tmp, idx, 50, 'ge');
            [idxRem5] = pnPreprocessRemovalIndices(tmp, idx, 90, 'le');
            idxRem4 = cellfun(@(x,y) x & y, idxRem4, idxRem5, 'UniformOutput',false);
            N = sum(cell2mat(cellfun(@(x) sum(x), idxRem4, 'UniformOutput', false)));
            if verboseFlag; fprintf('Deleted %2.0f values which were between [50,90].\n',N); end;

            [idxRem5,N] = pnPreprocessRemovalIndices(tmp, idx, 119, 'gt');
            if verboseFlag; fprintf('Deleted %2.0f values which were between above 119.\n',N); end;

            idxRem = cellfun(@(l,m,f,a,o) l | m | f | a | o, idxRem, idxRem2, idxRem3, idxRem4, idxRem5, 'UniformOutput',false);

            if verboseFlag; fprintf('Replaced %2.0f values which were incorrectly converted F->C.\n',sum(cell2mat(idxManip))); end;
            if verboseFlag; fprintf('Replaced %2.0f values which were incorrectly converted F->C twice.\n',sum(cell2mat(idxManip2))); end;
            % Report the Fahrenheit mask (idxManip3), not the 2x mask
            if verboseFlag; fprintf('Replaced %2.0f values which were incorrectly recorded as F.\n',sum(cell2mat(idxManip3))); end;

        case 'TroponinI'
            % Reference ranges: 0-0.4 µg/L
            % Could be off by orders of 10
            % NOTE(review): the applied factor is 0.1 while the message says
            % "0.11 (30->0.3)" - confirm which scaling is intended.
            idxManip = cellfun(@(x) (x>30), tmp(:,4),'UniformOutput',false);
            low = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            low = cellfun(@(x) x*0.1, low, 'UniformOutput',false);
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = low{m};
            end
            if verboseFlag; fprintf('Multiplied %2.0f values by 0.11 (30->0.3).\n',sum(cell2mat(idxManip))); end;

        case 'TroponinT'
            % Reference ranges: 0-0.1 µg/L
            % Could be off by orders of 10.
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;

        case 'Urine'
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;

        case 'WBC' % cells/nL
            % Reference range: 4.5-11.0
            % 1 (10^3)/µL == 1 (10^9)/L
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 1, 'lt');
            % NOTE(review): the x10 values below are only printed; they are
            % never written back to tmp, and all values <1 are deleted.
            idxManip = cellfun(@(x) x<1 & x>0, tmp(:,4),'UniformOutput',false);
            low = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            low = cellfun(@(x) x*10, low, 'UniformOutput',false);
            if verboseFlag; pnPreprocessPrintReplacedValues(tmp(:,4),low,idxManip); end;
            if verboseFlag; fprintf('Deleted %2.0f values which were less than 1.\n',N); end;

        case 'Weight' % kg
            % Reference range:
            %   86.6 kg (190.9 lb) (males)
            %   74.4 kg (164.0 lb) (females)
            % 1 kilogram = 2.20462262 pounds, 1 pound = 0.45359237 kilograms
            % 1 kilogram = 0.15747304 stones, 1 stone = 6.35029318 kilograms

            %=== Impute NaN in the first weight, delete the others
            idxManip = cellfun(@(x) (x==-1), tmp(:,4),'UniformOutput',false);
            low = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            low = cellfun(@(x) NaN, low, 'UniformOutput',false);
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = low{m};
            end
            if verboseFlag; fprintf('Replaced %2.0f values which were -1 with NaN.\n',sum(cell2mat(idxManip))); end;

            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 0, 'eq');
            if verboseFlag; fprintf('Deleted %2.0f values which were 0.\n',N); end;

            %=== The following values continually pop up:
            %   0.6 is on patient 3889 (subid 142393)
            %   Their weight is constant at 70, then becomes 0.6 at 467 min
            [idxRem2,N] = pnPreprocessRemovalIndices(tmp, idx, 35, 'lt');
            if verboseFlag; fprintf('Deleted %2.0f values which were less than 35.\n',N); end;
            [idxRem3,N] = pnPreprocessRemovalIndices(tmp, idx, 299, 'gt');
            if verboseFlag; fprintf('Deleted %2.0f values which were greater than 299.\n',N); end;
            idxRem = cellfun(@(f,a,t) f | a | t, idxRem, idxRem2, idxRem3, 'UniformOutput',false);
    end

    %=== Replace data with tmp
    for m=2:size(data,2)
        data_processed(:,m) = cellfun(@pnPreprocessReplaceData, data_processed(:,m), tmp(:,m), idx, 'UniformOutput', false);
    end

    %=== Remove deleted data from 'data'
    if ~isempty(idxRem)
        for m=2:size(data,2)
            data_processed(:,m) = cellfun(@pnPreprocessDeleteData, data_processed(:,m), idxRem, 'UniformOutput', false);
        end
    end
end

end

function [data] = pnPreprocessDeleteData(data,idx)
%PNPREPROCESSDELETEDATA Deletes data in "data" at idx (used with cellfun)
%   [ data ] = pnPreprocessDeleteData(data,idx)
%
%   Inputs:
%       data - Vector of data
%       idx  - Index for data to be removed
%
%   Outputs:
%       data - Data with elements at idx deleted
%
%   Example
%       data(:,m) = cellfun(@pnPreprocessDeleteData, data(:,m), idx, 'UniformOutput', false);
%
%   See also PNPREPROCESSDATA

%   Copyright 2012 Alistair Johnson

data(idx)=[];

end

function [idxOut, N] = pnPreprocessRemovalIndices(tmp, idx, val, equalityFcnStr)
%PNPREPROCESSREMOVALINDICES Generates indices to be used to remove data
%   [idxOut] = pnPreprocessRemovalIndices(tmp, idx, val, equalityFcnStr)
%   calculates the indices of data which satisfies some condition
%   specified by equalityFcnStr and val. For example, if equalityFcnStr
%   is 'eq' and val is 0, then the function locates the indices of tmp
%   which contain data equal to 0. These indices are then translated
%   into indices in the original data cell, to be used for data
%   deletion at a later point in PNPREPROCESSDATA.
%
%   [idxOut,N] = pnPreprocessRemovalIndices(tmp, idx, val, equalityFcnStr)
%   also outputs the number of entries being deleted.
%
%   Inputs:
%       tmp - Cell array with data only from a given field
%             (the comparison is applied to column 4)
%       idx - Indices that were used to extract tmp from the original
%             data cell array
%       val - A value used for comparison with tmp
%       equalityFcnStr - Name of the function used to compare the data in
%             tmp to val (e.g. 'eq', 'lt', 'gt', 'le', 'ge'); it is
%             called as fcn(data, val)
%
%   Outputs:
%       idxOut - Indices of data cell array to be deleted
%       N - Number of entries to be deleted
%
%   Example:
%       data = pnLoadTextFilesCell([bpath 'set-a']);
%       [tmp,idx] = pnExtractField(data,'HR');
%       [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 0, 'eq');
%       for m=2:4
%           data(:,m) = cellfun(@pnPreprocessDeleteData,...
%               data(:,m), idxRem, 'UniformOutput', false);
%       end
%       fprintf('Deleted %2.0f values which were 0.\n',N);
%
%   See also PNPREPROCESSDATA

%   Copyright 2012 Alistair Johnson

fcnTmp = str2func(equalityFcnStr);

% Flag matching entries within each patient's extracted field values
idxRem = cellfun(@(x) fcnTmp(x,val), tmp(:,4),'UniformOutput',false);

% Translate the per-field flags into the coordinates of the original data
idxOut = cellfun(@pnSubfcnRemIdx, idx, idxRem, 'UniformOutput',false);

N = sum(cell2mat(cellfun(@(x) sum(x), idxOut, 'UniformOutput', false)));

end

function [idx] = pnSubfcnRemIdx(idx, idxRem)
%PNSUBFCNREMIDX Map removal flags for a field into original data indices
%   The positions selected by idx are overwritten with idxRem.
%   NOTE(review): presumably idx is a logical mask over one patient's data
%   with as many true entries as idxRem has elements - confirm against
%   pnExtractField.
idx(idx) = idxRem;
end

function [data] = pnPreprocessReplaceData(data,tmp,idx)
%PNPREPROCESSREPLACEDATA Replace data in "data" with "tmp"
%   [data] = pnPreprocessReplaceData(data,tmp,idx) replaces values in data
%   with values in tmp using indices idx to map tmp to data. This function
%   is used with cellfun in PNPREPROCESSDATA.
%
%   Inputs:
%       data - Vector of data
%       tmp  - Vector of data after some form of preprocessing
%       idx  - Indices mapping tmp into data
%
%   Outputs:
%       data - Vector of data with tmp imputed within
%
%   Example
%       data(:,m) = cellfun(@pnPreprocessReplaceData, data(:,m), tmp(:,m), idx, 'UniformOutput', false);
%
%   See also PNPREPROCESSDATA

data(idx)=tmp;

end

function [] = pnPreprocessPrintReplacedValues(orig,new,idx)
%PNPREPROCESSPRINTREPLACEDVALUES Prints values which were modified to
%command window.
%   pnPreprocessPrintReplacedValues(orig,new,idx) prints each value that is
%   replaced in the given preprocessing step followed by its new value.
%
%   Inputs:
%       orig - Cell array of data vectors before the preprocessing step
%       new  - Cell array holding, per cell, only the replacement values
%              (one per true entry of the matching idx cell)
%       idx  - Cell array of logical masks marking the changed entries of
%              orig
%
%   Outputs:
%       None; a list of "old --> new" pairs is printed.
%
%   Example
%       pnPreprocessPrintReplacedValues(tmp(:,4),low,idxManip);
%
%   See also PNPREPROCESSDATA

%=== Extract cell indices which contain changed data
idxChanged = find(cellfun(@(x) ~isempty(x) && any(x), idx)==1);

for k=1:numel(idxChanged)
    idxChangedTemp = find(idx{idxChanged(k)}==1);
    for m=1:numel(idxChangedTemp)
        % new holds only the changed values, so it is indexed by m
        fprintf('%3.2f --> %3.2f\n',...
            orig{idxChanged(k)}(idxChangedTemp(m)),...
            new{idxChanged(k)}(m));
    end
end

end