diff --git a/.gitignore b/.gitignore
index 0a0275d..18dc1f5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,5 +13,9 @@ dev/false_positive_study.RData
 dev/false_positive_study.pdf
 dev/*rds
 dev/*rda
+dev/*csv
+dev/*xlsx
+dev/*RData
+dev/*txt
 dev/approximation_bias_correction_cache*
 temp*
diff --git a/R/ppcseq.R b/R/ppcseq.R
index 0ec98d0..5af8588 100644
--- a/R/ppcseq.R
+++ b/R/ppcseq.R
@@ -64,7 +64,8 @@ identify_outliers = function(.data,
 	tol_rel_obj = 0.01,
 	just_discovery = F,
 	seed = sample(1:99999, size = 1),
-	adj_prob_theshold_2 = NULL
+	adj_prob_theshold_2 = NULL,
+	return_fit = FALSE
 ) {
 	# Prepare column same enquo
 	.sample = enquo(.sample)
@@ -281,23 +282,32 @@ identify_outliers = function(.data,
 	merge_results(

 		# Calculate CI 2 for discovery for plotting
-		res_discovery %>%
-			left_join(
-				(.) %>%
-					attr("fit") %>%
-					fit_to_counts_rng_approximated(adj_prob_theshold_2, how_many_posterior_draws_2, truncation_compensation = 0.7352941, cores) %>%
-					select(S, G, .lower_1 = .lower, .upper_1 = .upper)
-			),
-		res_test, formula,
+		res_discovery,
+
+		# Used only for the article's comparative analyses between the 1-step and 2-step procedures
+		# %>%
+		# 	left_join(
+		# 		(.) %>%
+		# 			attr("fit") %>%
+		# 			fit_to_counts_rng_approximated(adj_prob_theshold_2, how_many_posterior_draws_2, truncation_compensation = 0.7352941, cores) %>%
+		# 			select(S, G, .lower_1 = .lower, .upper_1 = .upper)
+		# 	),
+
+		res_test,
+		formula,
 		!!.transcript,
 		!!.abundance,
 		!!.sample,
 		do_check_only_on_detrimental
 	) %>%

-		# Add fit attribute if any
-		add_attr(res_discovery %>% attr("fit"), "fit 1") %>%
-		add_attr(res_test %>% attr("fit"), "fit 2") %>%
+		# If return_fit, attach the Stan fits of the two steps as attributes
+		when(
+			return_fit ~ (.) %>%
+				add_attr(res_discovery %>% attr("fit"), "fit 1") %>%
+				add_attr(res_test %>% attr("fit"), "fit 2"),
+			~ (.)
+		) %>%

 		# Add total draws
 		add_attr(res_test %>% attr("total_draws"), "total_draws")
@@ -483,7 +493,7 @@ do_inference = function(my_df,
 		as_matrix() %>% t

 	# Get the matrix of the idexes of the outlier data points
-	# to explude from the model if it is the second passage
+	# to exclude from the model if it is the second passage
 	to_exclude_MPI = get_outlier_data_to_exlude(counts_MPI, to_exclude, shards)

 	# Package data
@@ -547,7 +557,7 @@ do_inference = function(my_df,

 		ifelse_pipe(
 			approximate_posterior_analysis,
-			~ .x %>% fit_to_counts_rng_approximated(adj_prob_theshold, how_many_posterior_draws * 10, truncation_compensation, cores),
+			~ .x %>% fit_to_counts_rng_approximated(adj_prob_theshold, how_many_posterior_draws * 10, truncation_compensation, cores, how_many_to_check),
 			~ .x %>% fit_to_counts_rng(adj_prob_theshold)
 		) %>%
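Note on the R/ppcseq.R change above: identify_outliers() gains a return_fit argument, and purrr::when() attaches the Stan fits of the discovery and test steps as attributes "fit 1" and "fit 2" only when it is TRUE. The snippet below is a minimal, self-contained sketch of that conditional-attribute pattern, not ppcseq code: add_attr_() stands in for the package's internal add_attr() helper, and the result table and fit objects are placeholders.

library(purrr)
library(magrittr)

# Stand-in for ppcseq's internal add_attr(); same argument order as in the diff
add_attr_ <- function(x, value, name) { attr(x, name) <- value; x }

result     <- data.frame(gene = c("A", "B"))     # placeholder result table
fit_1      <- "stanfit of the discovery step"    # placeholder fit objects
fit_2      <- "stanfit of the test step"
return_fit <- TRUE

result <-
  result %>%
  when(
    return_fit ~ (.) %>%
      add_attr_(fit_1, "fit 1") %>%
      add_attr_(fit_2, "fit 2"),
    ~ (.)
  )

attr(result, "fit 1")
#> [1] "stanfit of the discovery step"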
diff --git a/R/utilities.R b/R/utilities.R
index 8473ab9..e7190ef 100644
--- a/R/utilities.R
+++ b/R/utilities.R
@@ -448,8 +448,8 @@ merge_results = function(res_discovery, res_test, formula, .transcript, .abundan
 			!!.abundance,
 			!!.sample,
 			mean,
-			`.lower_1`,
-			`.upper_1`,
+			# `.lower_1`,
+			# `.upper_1`,
 			`exposure rate`,
 			slope_1 = slope,
 			one_of(parse_formula(formula))
@@ -525,8 +525,6 @@ format_results = function(.data, formula, .transcript, .abundance, .sample, do_c
 		)
 }

-
-
 #' Select only significant genes plus background for efficient normalisation
 #'
 #' @importFrom rstan sampling
@@ -564,7 +562,6 @@ select_to_check_and_house_keeping = function(.data, .do_check, .significance, .t
 		}
 }

-
 #' add_exposure_rate
 #'
 #' @importFrom tidyr separate
%>% gsub("V", "", .))) %>% + # as_tibble() %>% mutate(.draw = 1:n()) %>% gather(par, exposure, -.draw) %>% separate(par, c("par", "S"), sep="\\.") %>% select(-par) + + expand_grid(S = 1:(dim(draws_mu)[2]), G = 1:(dim(draws_mu)[3])) %>% + mutate(truncation_compensation = !!truncation_compensation) %>% mutate( - CI = map( - data, - ~ { - .x_supersampled = .x %>% sample_n(how_many_posterior_draws, replace = T) - draws = rnbinom(n =how_many_posterior_draws, mu = exp(.x_supersampled$mu + .x_supersampled$exposure), size = 1/exp(.x_supersampled$sigma) * truncation_compensation ) - draws %>% - # Process quantile - quantile(c(adj_prob_theshold, 1 - adj_prob_theshold)) %>% - tibble::as_tibble(rownames="prop") %>% - tidyr::spread(prop, value) %>% - setNames(c(".lower", ".upper")) %>% - # Add mean and sd - dplyr::mutate(mean = mean(draws), sd = sd(draws)) - } + CI = pmap( + list( S,G, truncation_compensation), + ~ tibble( + mu = draws_mu[,..1, ..2], + sigma = draws_sigma[,..2] , + exposure = draws_exposure[,..1], + truncation_compensation = ..3 + ) %>% + get_CI_semi_analytically_pnbinom( adj_prob_theshold ) %>% + + # Add mean and sd + dplyr::mutate( + mean = mean(exp(draws_mu[,..1, ..2] + draws_exposure[,..1])), + sd = mean(1/exp(draws_sigma[,..2]) * truncation_compensation) + ) ) ) %>% - select(-data) %>% - unnest(CI) %>% - - # Adapt to old dataset mutate(.variable = "counts_rng") %>% - mutate(S = as.integer(S), G = as.integer(G)) + unnest(CI) + + # draws_mu %>% + # left_join(draws_sigma, by = c(".draw", "G")) %>% + # left_join(draws_exposure, by = c(".draw", "S")) %>% + # nest(data = -c(S, G)) %>% + # mutate( + # CI = future_map( + # data, + # ~ { + # get_CI_semi_analytically_pnbinom( + # .x, + # adj_prob_theshold + # ) %>% + # # Add mean and sd + # dplyr::mutate( + # mean = mean(exp(.x$mu + .x$exposure)), + # sd = mean(1/exp(.x$sigma) * truncation_compensation) + # ) + # } + # ) + # ) %>% + # select(-data) %>% + # unnest(CI) %>% + # + # # Adapt to old dataset + # mutate(.variable = "counts_rng") %>% + # mutate(S = as.integer(S), G = as.integer(G)) +} + +get_CI_semi_analytically_pnbinom_core = function(.x, .quantile){ + ab<-range( + qnbinom( + .quantile, + mu = exp(.x$mu + .x$exposure), + size = 1/exp(.x$sigma) * .x$truncation_compensation + ) + ); + + if(sum(ab) == 0) return(0); + + opt<-optim( + mean(ab), + function (x) + sapply( + x, + function(p) + ((.quantile)-mean(pnbinom(p, mu = exp(.x$mu + .x$exposure), + size = 1/exp(.x$sigma) * .x$truncation_compensation )))^2 + ), + method='Brent', + lower=ab[1], + upper=ab[2] + ) + opt$par +} + +get_CI_semi_analytically_pnbinom = function(.x, .quantile){ + + tibble( + .lower = .x %>% get_CI_semi_analytically_pnbinom_core(.quantile), + .upper = .x %>% get_CI_semi_analytically_pnbinom_core(1-.quantile) + ) + + +} +get_CI_semi_analytically_rnbinom = function(.x, .quantile, how_many_posterior_draws){ + .x_supersampled = .x %>% sample_n(how_many_posterior_draws, replace = T) + draws = rnbinom(n =how_many_posterior_draws, mu = exp(.x_supersampled$mu + .x_supersampled$exposure), size = 1/exp(.x_supersampled$sigma) * .x$truncation_compensation ) + draws %>% + # Process quantile + quantile(c(.quantile, 1 - .quantile)) %>% + tibble::as_tibble(rownames="prop") %>% + tidyr::spread(prop, value) %>% + setNames(c(".lower", ".upper")) } save_generated_quantities_in_case = function(.data, fit, save_generated_quantities){ diff --git a/inst/stan/negBinomial_MPI.stan b/inst/stan/negBinomial_MPI.stan index b00dafe..2baa956 100644 --- a/inst/stan/negBinomial_MPI.stan +++ 
diff --git a/inst/stan/negBinomial_MPI.stan b/inst/stan/negBinomial_MPI.stan
index b00dafe..2baa956 100644
--- a/inst/stan/negBinomial_MPI.stan
+++ b/inst/stan/negBinomial_MPI.stan
@@ -112,6 +112,7 @@ functions{
 	}

 	matrix merge_coefficients(row_vector intercept, row_vector alpha_sub_1, matrix alpha_2, int C, int S, int G){
+		// Build the coefficient matrix, appending the housekeeping genes
 		matrix[C,G] my_alpha;
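Usage note: with these changes a call could look like the hypothetical sketch below. Only return_fit and the attribute names "fit 1", "fit 2" and "total_draws" come from this diff; the data, the formula and the other argument names and values are illustrative, and any required arguments not visible in this patch are omitted.

library(ppcseq)
library(dplyr)

result <-
  counts %>%                      # a tidy tibble of per-sample transcript counts (placeholder)
  identify_outliers(
    formula     = ~ condition,    # illustrative model formula
    .sample     = sample,
    .transcript = transcript,
    .abundance  = count,
    return_fit  = TRUE            # new: keep the Stan fits of both steps
  )

# The two fits and the total number of posterior draws travel as attributes
fit_discovery <- attr(result, "fit 1")
fit_test      <- attr(result, "fit 2")
total_draws   <- attr(result, "total_draws")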