%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%This scripts is used to process LC-MS data for the pancreatics study
% by John koomen etc @ MD.Anderson .
% A series of preprocessing steps were employed to produce an expansive
% list of peptides for further investigation and sequencing. These steps
% included spectral alignment, normalization, identifying of local maxima,
% further identifying "large" maxima as peaks, and looking for signs of
% differential expression.
% copyright @MD Anderson Department of Biostatistics and Mathmatics
% Keith.A.Baggerly and Haitao Zhao
% if you have any questions please send email to hzhao@mdanderson.org
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Load in the clinical info, and sample ids
% There are 23 samples total including control,cancer and chemo patients
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
td70.sample_info = {
'106', 'Chemo',...
'106a', 'Chemo',...
'120', 'Chemo',...
'120b', 'Chemo',...
'123', 'Cancer',...
'126', 'Cancer',...
'131', 'Chemo',...
'151', 'Chemo',...
'151a', 'Chemo',...
'166', 'Chemo',...
'172', 'Cancer',...
'222', 'Cancer',...
'224', 'Cancer',...
'300', 'Control',...
'300a', 'Control',...
'314', 'Control',...
'315a', 'Control',...
'336', 'Control',...
'339', 'Control',...
'349', 'Control',...
'350', 'Control',...
'352', 'Cancer',...
'355', 'Control',...
};
td70.n_samples = 23;
td70.sample_info = reshape(td70.sample_info,2,td70.n_samples)';
td70.normal = find(strcmp('Control',td70.sample_info(:,2)));
td70.cancer = find(strcmp('Cancer',td70.sample_info(:,2)));
td70.chemo = find(strcmp('Chemo',td70.sample_info(:,2)));
td70.gcancer=[td70.cancer' td70.chemo']';
td70.new_ord = [td70.normal' td70.cancer' td70.chemo']';
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% end Load in the clinical info, and sample ids
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% load in the data
% interpolate intensities to a common mass vector
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% First define the masses that we'll use. Across all the spectras
% we define the full length spectra as the common mass vector.
% the spectra length less than this vector will align to it
% after the interpolate all the spectra will have the same length(26312
% in this case)
td70.file_names = dir('Raw Spectra');
td70.file_names = td70.file_names(4:end);
Full_Spectra_Size=26312;
for(i1 = 1:td70.n_samples)
mz_vec=zeros(Full_Spectra_Size,150);
raw_data=zeros(Full_Spectra_Size,150);
spectra_size=zeros(150,1);
fid = fopen(['Raw Spectra\' td70.file_names(i1).name]);
for (i=1:150)
[Spectra_info,count] = fscanf( fid,'%*s %d %*s %*s %d %*s %*s %f',3);
if (count==0)
break;
end
Spectra_num=Spectra_info(1);
Spectra_size=Spectra_info(2);
[temp,count] = fscanf(fid,'%f%f',[2,Spectra_size]);
temp=temp';
mz_vec(1:Spectra_size,Spectra_num)=temp(:,1);
raw_data(1:Spectra_size,Spectra_num)=uint16(temp(:,2));
spectra_size(i,1)=Spectra_size;
end
clear Spectr_num Spectra_time Spectra_size temp;
partindex=find(spectra_size~= Full_Spectra_Size);
fullindex=find(spectra_size == Full_Spectra_Size);
if(max(spectra_size)==Full_Spectra_Size)
mz_mat=mz_vec(:,fullindex(1));
end
raw_data2=zeros(Full_Spectra_Size,150);
for(i=1:length(partindex))
[inter,reptindexi,reptindexz]=intersect(mz_vec(:,partindex(i)),mz_mat);
raw_data2(reptindexz,partindex(i))=raw_data(reptindexi,partindex(i));
end
if(max(spectra_size)==Full_Spectra_Size)
raw_data2(:,fullindex)=raw_data(:,fullindex);
end
raw_data2=uint16(raw_data2);
raw_spectra(:,:,i1)=raw_data2;
clear raw_data mz_vec raw_data2 partindex fullindex reptindexi reptindexz inter spectra_size savename;
end
td70.raw_spectra=raw_spectra;
td70.mz_vec=mz_mat;
clear raw_spectra,mz_mat;
% end Load in data from txt file
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% To decide which time period we should select for analysis
% we generate the TIC (total ion current) as a reference.
% generate the TIC pic for each sample
% sum the TIC for each elution time across the sampless
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
tic=zeros(150,td70.n_samples);
for (i=1:td70.n_samples)
for (j=1:150)
tic(j,i)=sum(double(td70.raw_spectra(:,j,i)));
end
end
sumtic=sum(tic,1);
sumtic=sumtic';
td70.tic=tic;
clear tic i j Im savename;
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% from the TIC pic we select the domain we interested in. Due to the nature
% of the LC , there will be offset from sample to sample we use TIC(mainly
% the injection artifact) to manualy aligh the samples,after alignment,
% Normalize the data according to the ratio in sumtic.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
adjust=td70.raw_spectra(:,21:100,:);
adjust=sum(adjust,2);
adjust=squeeze(adjust);
td70.sample_scaling = mean(adjust);
target = median(td70.sample_scaling);
td70.sample_scaling = target ./ td70.sample_scaling;
for(i=1:td70.n_samples)
td70.raw_spectra(:,:,i)=double(td70.raw_spectra(:,:,i)).*td70.sample_scaling(i);
end
clear target adjust;
%%%%%%%%%%%%%%%%%%%%%%%%%%
% Normalize to total ion current
%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% As a prelude to identifying peaks, we identified local maxima in each
% spectrum, recorded intensities at the maximum and the index of the maximum
% within the vector. The maxima were then matched by m/z across spectra;
% the matching window identified maxima differing in location from the first
% by less than 0.5 Da. thus we got a peak list for each sample in each time
% period.
%%%%%%%%%%%%%%%%%%s%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
time_period=80;
td70.magic.max_Dalton_width = 50;
max_Dalton_width = td70.magic.max_Dalton_width;
td70.magic.max_num_peaks = 1500;
max_num_peaks = td70.magic.max_num_peaks; % magic; we have about 1500 Daltons here
local_Dalton_width = zeros(26312,td70.n_samples);
for(i1 = 1:length(local_Dalton_width)-max_Dalton_width)
local_Dalton_width(i1) = ...
sum(td70.mz_vec(i1:i1+max_Dalton_width) < td70.mz_vec(i1)+1);
end
raw_peaks = zeros(max_num_peaks,time_period,td70.n_samples) ;
raw_peaks=uint16(raw_peaks);
peak_index=zeros(max_num_peaks,time_period);
peak_number=zeros(time_period,1);
std_sample=19;
for k=(1:time_period)
% Next, look for the maxima in the first spectrum.
% When a maximum is found, record its intensity and index.
spec_1_peaks = zeros(max_num_peaks,1);
n_peaks = 0;
for(i1 = max_Dalton_width:length(local_Dalton_width)-max_Dalton_width)
half_local_Dalton_width = floor((local_Dalton_width(i1)/2) - 1);
if((td70.raw_spectra(i1,k,std_sample) == max(td70.raw_spectra(i1-half_local_Dalton_width:i1+half_local_Dalton_width,k,std_sample))))
if(n_peaks > 0)
if(i1 > spec_1_peaks(n_peaks,1) + half_local_Dalton_width)
n_peaks = n_peaks+1;
spec_1_peaks(n_peaks,1) = i1;
end
else
n_peaks = n_peaks+1;
spec_1_peaks(n_peaks,1) = i1;
end
end
end
% Having found the peaks in the first spectrum, find the maxima
% in the corresponding locations (allowing for offset) in the
% other spectra
all_spec_at_spec_1_peaks = zeros(max_num_peaks,1,td70.n_samples);
for(i1 = 1:n_peaks)
half_local_Dalton_width = ...
floor( (local_Dalton_width(spec_1_peaks(i1,1))/2) - 1 );
all_spec_at_spec_1_peaks(i1,1,:) = ...
max(td70.raw_spectra(...
spec_1_peaks(i1,1)-half_local_Dalton_width:...
spec_1_peaks(i1,1)+half_local_Dalton_width,k,:),[],1);
end
raw_peaks(:,k,:) = all_spec_at_spec_1_peaks;
peak_index(:,k)=spec_1_peaks(:,1);
peak_number(k)=n_peaks;
clear spec_1_peaks n_peaks;
end
td70.peaks=raw_peaks;
td70.peak_index=peak_index;
td70.peak_number=peak_number;
clear half_local_Dalton_width local_Dalton_width max_Dalton_width;
clear max_num_peaks all_spec_at_spec_1_peaks raw_peaks peak_number peak_index;
clear n_peaks max_Dalton_width local_Dalton_width i1 j k;
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% As most of the maxima are due to chemical noise, we wanted to identify only
% the largest as peaks. To do this, we used a rough analog of the signal to
% noise ratio. First, we used a sliding window of 90 sequential maxima along
% the m/z vector and also +/- 20s elution time; the value of 90 was chosen
% so that more than 2/3 of the maxima in any window were, in our assessment,
% chemical noise. The 90 maxima values were sorted, and the distance Di between
% the 30th and 60th value was taken as a robust measure of the spread in maxima
% heights to be expected in the absence of structure. More than 5 Di in at least
% two spectra indentified a maximum as a “peak” if it was greater than the median
% intensity in the 90 maxima window
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
td70.magic.peak_span = 90;
n_wide = td70.magic.peak_span;
time_period=80;
key_peaks_index=zeros(100,time_period);
key_peaks_number=zeros(time_period,1);
for(k=1:time_period)
is_real_peak = zeros(size(td70.peaks,1),1);
for(i1 = 1:size(td70.peaks,1)-n_wide-1)
n1 = i1;
n2 = n1 + n_wide-1;
fraction=double(td70.peaks(n1:n2,k,:));
fraction=squeeze(fraction);
sort_all = sort(fraction);
mid_width = sort_all(floor(2*n_wide/3),:) - ...
sort_all(floor(n_wide/3),:);
mid_width=mid_width+1;
scale_all = sort_all ./ repmat(mid_width,n_wide,1);
big_peaks = ((fraction) - ...
repmat(sort_all(floor(n_wide/2),:),n_wide,1)) ./ ...
repmat(mid_width, n_wide,1);
is_real_peak(n1:n2) = is_real_peak(n1:n2) + (sum(big_peaks' > 5) > 2)';
end
for(i1 = 1:n_wide-1)
is_real_peak(i1) = is_real_peak(i1)*(n_wide/i1);
is_real_peak(end-i1+1) = is_real_peak(end-i1+1)*(n_wide/i1);
end
temp = find(is_real_peak > 60);
key_peaks_number(k,1)=length(temp);
key_peaks_index(1:length(temp),k)=temp;
clear is_real_peak i1 big_peaks mid_width;
clear n1 n2 sort_all scale_all temp;
end
clear time_period n_wide;
%%%%%%%%%%%%%%%%%%%%%%%%%%
% look for the really big peaks in the spectrum,
% in order to focus just on those
%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%
% Having identified peaks we next focused on those showing the most
% differential in their intensities according to T-statistics. We
% applied two sample t-tests contrasting each of 2 subsets of the disease
% group (all Cancer, chemo) to the control group and flagged peaks with
% t-values greater than 3, indicating higher intensities in the disease groups.
% Compute two-sample t-statistics
%%%%%%%%%%%%%%%%%%%%%%%%%%
time_period=80;
peak_number=159;
max_peak_number=max(peak_number);
td70.peaks=td70.peaks(1:max_peak_number,:,:);
chemo_vs_normal=zeros(max_peak_number,time_period);
cancer_vs_normal=zeros(max_peak_number,time_period);
for(k=1:time_period)
chemo_vs_normal(1:peak_number(i1),i1) = ...
((mean(td70.peaks(1:key_peaks_number(i1),td70.chemo,i1)') - ...
mean(td70.peaks(1:key_peaks_number(i1),td70.normal,i1)')) ./ ...
sqrt(var(td70.peaks(1:key_peaks_number(i1),td70.chemo,i1)')/...
length(td70.chemo) + ...
var(td70.peaks(1:key_peaks_number(i1),td70.normal,i1)')/...
length(td70.normal)))';
cancer_vs_normal(1:peak_number(i1),i1) = ...
((mean(td70.peaks(1:key_peaks_number(i1),td70.cancer,i1)') - ...
mean(td70.peaks(1:key_peaks_number(i1),td70.normal,i1)')) ./ ...
sqrt(var(td70.peaks(1:key_peaks_number(i1),td70.cancer,i1)')/...
length(td70.cancer) + ...
var(td70.peaks(1:key_peaks_number(i1),td70.normal,i1)')/...
length(td70.normal)))';
end
td70.t.cancer_vs_normal=cancer_vs_normal;
td70.t.chemo_vs_normal=chemo_vs_normal;
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Compute two-sample t-statistics
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%