function[stats_and_pvals] = bioinf_sim_fn(n,p_A,p_B_mult,... n_A,n_B,k_over,n_sim) % BIOINF_SIM_FN(N,P_A,P_B_MULT,N_A,N_B,K_OVER,N_SIM) % This function simulates test statistic results % for the significance of SAGE differential expression % assuming that the underlying model is a beta-binomial. % Inputs are % N -- the common size of all of the libraries involved % P_A -- the true population proportion for group A % P_B_MULT -- the population proportion for group B, as % a multiple of the propn in group A % N_A -- the number of libraries in group A % N_B -- the number of libraries in group B % K_OVER -- the amount of overdispersion; the variance % is taken to be k_over*n*p*(1-p). % N_SIM -- The number of simulations to run. % % The function returns an N_SIM by 8 matrix. The 8 columns % are % chi^2, p_val, two-sample t, df, p_val, t_w, df, p_val. % Last updated Dec 11, 02 % Keith Baggerly, kabagg@mdanderson.org % Planned updates: letting n be a vector of library sizes % rather than a single constant. %%%%%%%%%%%%%%%%%%%%%%%% % Set a few constants %while 0 %%%%%%%%%%%%%%%%%%%%%%%% p_B = p_B_mult * p_A; % the mean proportion for group 2 % Compute the beta parameters required to inflate the % variance by a factor of k_over. This uses the fact that % V(X) = {ab/[(a+b)(a+b+1)]}{[n^2/(a+b)] + n} % = np(1-p)[(a+b)/(a+b+1)]{[n/(a+b)] + 1}, so % k_over = [c/(c+1)][(n/c) + 1], where c = a+b. Rearranging, % k_over(c+1) = n + c, so alpha_plus_beta = (n-k_over)/(k_over-1); % sum of the beta parameters alpha_A = p_A * alpha_plus_beta; beta_A = alpha_plus_beta - alpha_A; alpha_B = p_B * alpha_plus_beta; beta_B = alpha_plus_beta - alpha_B; p_A_mat = betarnd(alpha_A,beta_A,n_sim,n_A); p_B_mat = betarnd(alpha_B,beta_B,n_sim,n_B); x_A_mat = poissrnd(n*p_A_mat); % This is faster than binornd x_B_mat = poissrnd(n*p_B_mat); stats_and_pvals = zeros(n_sim,8); %%%%%%%%%%%%%%%%%%%%%%%% %end % Set a few constants %%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%% % Processing: Compute chi-squared values %while 0 %%%%%%%%%%%%%%%%%%%%%%%%%%%% sage2sum = [sum(x_A_mat')' sum(x_B_mat')']; chivals = zeros(n_sim,1); % This requires organizing the data as entries in a % 2x2 contingency table. N_1 = n_A * n; N_2 = n_B * n; N__ = N_1 + N_2; N11 = sage2sum(:,1); N12 = sage2sum(:,2); N21 = N_1 - N11; N22 = N_2 - N12; N1_ = N11 + N12; N2_ = N21 + N22; Nzero = (N1_ == 0); chivals = N__*( ((N11 .* N22) - (N12 .* N21)).^2 ) ./ ... ((N1_ .* N2_ .* N_1 .* N_2) + Nzero); chi_pvals = 1-chi2cdf(chivals,1); stats_and_pvals(:,1:2) = [chivals chi_pvals]; %%%%%%%%%%%%%%%%%%%%%%%%%%% %end % End Processing: Compute chi-squared values %%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%% % Processing: Compute two-sample t-test values %while 0 %%%%%%%%%%%%%%%%%%%%%%%%%%%% sage2_prop_A = x_A_mat; % Normally, we would standardize % the counts to a common library size. % Here, that isn't necessary. sage2_prop_B = x_B_mat; t2vals = zeros(n_sim,2); mu_A = mean(sage2_prop_A')'; mu_B = mean(sage2_prop_B')'; var_A = var(sage2_prop_A')'/n_A; var_B = var(sage2_prop_B')'/n_B; zero_var = (var_A == 0) & (var_B == 0); dfvec = ((var_A + var_B).^2 + zero_var) ./ ... ( ((var_A.^2)/(n_A-1)) + ((var_B.^2)/(n_B-1)) + zero_var); t2valvec = (mu_A - mu_B) ./ sqrt(var_A + var_B + eps*zero_var); t2vals = [t2valvec dfvec]; t_pvals = 2*tcdf(-abs(t2vals(:,1)),t2vals(:,2)); stats_and_pvals(:,3:5) = [t2vals t_pvals]; %%%%%%%%%%%%%%%%%%%%%%%%%%% %end % End Processing: Compute two-sample t-test values %%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%% % Processing: Compute tw values %while 0 %%%%%%%%%%%%%%%%%%%%%%%%%%%% n_vecA = n*ones(n_A,1)'; n_vecB = n*ones(n_B,1)'; var_matA = zeros(n_sim,5); var_matB = zeros(n_sim,5); for(i = 1:n_sim) [a1,b1,c1,d1,e1] = fit_ab2(n_vecA,x_A_mat(i,:)); var_matA(i,:) = [a1 b1 c1 d1 e1]; [a2,b2,c2,d2,e2] = fit_ab2(n_vecB,x_B_mat(i,:)); var_matB(i,:) = [a2 b2 c2 d2 e2]; end nAvec = n_A * ones(n_sim,1); nBvec = n_B * ones(n_sim,1); zero_var = (var_matA(:,2) == 0) & (var_matB(:,2) == 0); tw = (var_matA(:,1) - var_matB(:,1)) ./ ... sqrt(var_matA(:,2) + var_matB(:,2) + eps*zero_var); dfvec = ((var_matA(:,2) + var_matB(:,2)).^2 + zero_var) ./ ... ( ((var_matA(:,2).^2)./(nAvec-1)) + ... ((var_matB(:,2).^2)./(nBvec-1)) + ... zero_var); twvals = [tw dfvec]; tw_pvals = 2*tcdf(-abs(twvals(:,1)),twvals(:,2)); stats_and_pvals(:,6:8) = [twvals tw_pvals]; %%%%%%%%%%%%%%%%%%%%%%%%%%% %end % End Processing: Compute tw values %%%%%%%%%%%%%%%%%%%%%%%%%%%%