Skip to content

Commit

Permalink
Update simulations with SVA analysis.
Browse files Browse the repository at this point in the history
  • Loading branch information
pbastide committed Nov 9, 2022
1 parent ccf5c82 commit cdbb045
Show file tree
Hide file tree
Showing 6 changed files with 179 additions and 28 deletions.
Binary file not shown.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ Please see the header of each file for more a mode detailed description of each
* `04_simulations_data_analysis_results.R`: comparisons of the results from various methods on the same simulated datasets.
* `05_plot_results.R`: plot of the results as presented in the manuscript.
* `06_simulations_data_check.R`: check of the data using `countsimQC`.
* `07_data_analysis.R`: re-analysis of the Crayfish dataset.

* `data`: data issued from Stern et al. 2018.
* `crayfish.nodelabels.tre`: crayfish tree from Stern et al 2018.
Expand All @@ -36,6 +37,8 @@ Please see the header of each file for more a mode detailed description of each
* `full_result_table_1_50.rds`: scores of the various differential expression analysis methods applied on the various simulated datasets.
Produced by the `R` script `R_scripts/04_simulations_data_analysis_results.R`.

* `2022-11-02_2021-12-01_simulations_stern2018_results`: results of a more recent run of the analyses, that includes SVA tests.

## Requirements

* [`R`](https://cran.r-project.org/index.html). Version 4.0 or later.
Expand Down
61 changes: 56 additions & 5 deletions R_scripts/03_simulations_data_analysis.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ library(doParallel)
here_dir <- here()

name_data <- "stern2018"
datestamp_day_simus <- "2021-11-24" ## Change here to match simulation date
datestamp_day_simus <- "2021-12-01" ## Change here to match simulation date
simus_directory <- paste0(datestamp_day_simus, "_simulations_", name_data)
simus_directory <- file.path(here_dir, simus_directory)
load(file.path(simus_directory, "simulation_parameters.RData"))
Expand All @@ -54,11 +54,14 @@ dir.create(results_directory)
################################################################################
## Iterations to analyze
Nmin <- 1
Nmax <- 20
Nmax <- 50

## Required packages to pass to all nodes
reqpckg <- c("compcodeR", "here", "foreach")

## Keep only some prop var tree
all_prop_var_tree <- list(0.6, 0.8, 1.0)

## Number of cores
Ncores <- 3

Expand Down Expand Up @@ -157,7 +160,7 @@ foreach (tree_type = all_tree_types) %:%
## Remove some combinations of parameters
if(cond_type != "sights" && (lnorm != "TPM" || ltrans != "log2")) return(NULL)
if(block != "with_blocks" && (lnorm != "TPM" || ltrans != "log2")) return(NULL)
if(model_process == "OU" && (lnorm != "TPM" || ltrans != "log2" || cond_type != "sights" || block != "with_blocks")) return(NULL)
if(model_process == "OU" && (lnorm != "TPM" || ltrans != "log2" || block != "with_blocks")) return(NULL)
if(trend != "no_trend" && (lnorm != "TPM" || ltrans != "log2" || cond_type != "sights")) return(NULL)

## Run analysis
Expand Down Expand Up @@ -199,8 +202,8 @@ foreach (tree_type = all_tree_types) %:%

## Remove some combinations of parameters
if(cond_type != "sights" && (lnorm != "TPM" || ltrans != "log2")) return(NULL)
if(model_process == "OU" && (lnorm != "TPM" || ltrans != "log2" || cond_type != "sights")) return(NULL)
if(model_process == "OU" && (lnorm != "TPM" || ltrans != "log2")) return(NULL)

## Run analysis
runDiffExp(data.file = paste0(data_file, "_", i, ".rds"),
result.extent = method_name,
Expand All @@ -212,6 +215,54 @@ foreach (tree_type = all_tree_types) %:%
extra.design.covariates = NULL,
length.normalization = lnorm,
data.transformation = ltrans)

## For the first iteration only, generate the analysis code
if (i == 1) generateCodeHTMLs(file.path(results_directory, paste0(dataset, "_", i, "_", method_name, ".rds")), results_directory)
}

####################################################################
## SVA + limma
####################################################################
method <- "lengthNorm.sva.limma"

## Length normalization when there are lengths only
all_length_norm <- c("none")
if (use_lengths == "with_lengths") all_length_norm <- c("none", "RPKM", "TPM")
## Replicates correlation when there are replicates only
all_n.sv <- c("auto", "one")
if (!tree_type %in% c("no_tree", "us_star_tree")) all_blocks <- c("no_blocks", "with_blocks")

foreach (lnorm = all_length_norm) %:%
foreach (ltrans = c("log2", "sqrt")) %:%
foreach (trend = c("no_trend", "with_trend")) %:%
foreach (n.sv = all_n.sv) %:%
foreach(i = Nmin:Nmax, .packages = reqpckg, .verbose = TRUE) %do% {

## Method id
method_name <- paste(method, lnorm, ltrans, trend, n.sv, sep = ".")

## If replicate correlation, find the right blocks
# is_block <- NULL
# if (block == "with_blocks") is_block <- "id.species"

## Sanity check : if result file already exists, do not run the analysis
if (file.exists(file.path(results_directory, paste0(dataset, "_", i, "_", method_name, ".rds")))) return(NULL)

## Remove some combinations of parameters
if(cond_type != "sights" && (lnorm != "TPM" || ltrans != "log2")) return(NULL)
if(model_process == "OU" && (lnorm != "TPM" || ltrans != "log2")) return(NULL)
if(trend != "no_trend") return(NULL)

## Run analysis
runDiffExp(data.file = paste0(data_file, "_", i, ".rds"),
result.extent = method_name,
Rmdfunction = paste0(method, ".createRmd"),
output.directory = results_directory,
norm.method = "TMM",
length.normalization = lnorm,
data.transformation = ltrans,
trend = (trend == "with_trend"),
n.sv = ifelse(n.sv == "auto", "auto", 1))

## For the first iteration only, generate the analysis code
if (i == 1) generateCodeHTMLs(file.path(results_directory, paste0(dataset, "_", i, "_", method_name, ".rds")), results_directory)
Expand Down
38 changes: 36 additions & 2 deletions R_scripts/04_simulations_data_analysis_results.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ library(compcodeR)
# here_dir <- sub("save", "work", here())
here_dir <- here()

datestamp_day_simus <- "2021-11-25" ## Change here for the simulation date
datestamp_day_anaysis <- "2021-11-25" ## Change here for the analysis date
datestamp_day_simus <- "2021-12-01" ## Change here for the simulation date
datestamp_day_anaysis <- "2022-11-02" ## Change here for the analysis date
name_data <- "stern2018"
simus_directory <- paste0(datestamp_day_simus, "_simulations_", name_data)
simus_directory <- file.path(here_dir, simus_directory)
Expand Down Expand Up @@ -215,6 +215,40 @@ for (dataset in all_datasets) {
res.table$methodMod[grepl(nn, res.table$de.methods)] <- paste0(res.table$methodMod[grepl(nn, res.table$de.methods)], "_cor")
}
res.table$blockCor[is.na(res.table$blockCor)] <- "none"
# sva
res.table$sva <- grepl("sva", res.table$de.methods)
# nsv
get_nsv <- function(res_sub) {
n_files <- nrow(res_sub)
all_nsv <- rep(NA, n_files)
for (i in seq_len(n_files)) {
bool <- grepl("sva", all_method_files) &
grepl(res_sub[i, "transformation"], all_method_files) &
grepl(res_sub[i, "trendeBayes"], all_method_files) &
grepl(res_sub[i, "use_lengths"], all_method_files) &
grepl(res_sub[i, "lengthNormalization"], all_method_files) &
grepl(paste0("\\.", res_sub[i, "nsvmethod"], "\\."), all_method_files) &
grepl(paste0("_", res_sub[i, "repl"], "_lengthNorm"), all_method_files)
tmp <- readRDS(all_method_files[bool])
if (length(tmp@result.table$n.sv) == 0) {
print(res_sub[i, "de.methods"])
print(all_method_files[bool])
all_nsv[i] <- NA
} else {
all_nsv[i] <- tmp@result.table$n.sv[1]
}
}
return(all_nsv)
}
res.table$nsvmethod <- NA
for (nn in c("n\\.sv\\.auto", "n\\.sv\\.1")) {
res.table$nsvmethod[grepl(nn, res.table$de.methods)] <- ifelse(nn == "n\\.sv\\.auto", "auto", "one")
res.table$methodMod[grepl(nn, res.table$de.methods)] <- paste0(res.table$methodMod[grepl(nn, res.table$de.methods)], "_sva_", ifelse(nn == "n\\.sv\\.auto", "auto", "one"))
}
res.table$nsv <- 0
for (nn in c("n\\.sv\\.auto", "n\\.sv\\.1")) {
res.table$nsv[grepl(nn, res.table$de.methods)] <- get_nsv(res.table[grepl(nn, res.table$de.methods), ])
}

## Effective Sample Size
oneres <- readRDS(file.table$input.files[1])
Expand Down
89 changes: 68 additions & 21 deletions R_scripts/05_plot_results.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ library(here)
## File management
################################################################################
datestamp_day_simus <- "2021-12-01" ## Change here for the simulation date
datestamp_day_anaysis <- "2021-12-01" ## Change here for the analysis date
datestamp_day_anaysis <- "2022-11-02" ## Change here for the analysis date
name_data <- "stern2018"
simus_directory <- paste0(datestamp_day_simus, "_simulations_", name_data)
simus_directory <- here(simus_directory)
Expand Down Expand Up @@ -78,6 +78,9 @@ result.table$Design <- factor(result.table$Design, levels = c("block", "sight",
result.table$prop_var_tree <- as.factor(result.table$prop_var_tree)
result.table$effect_size <- as.factor(result.table$effect_size)

result.table$methodMod[grepl("n.sv.auto", result.table$de.methods)] <- "limma_sva_auto"
result.table$nsvmethod[grepl("n.sv.auto", result.table$de.methods)] <- "auto"

## Format for plot
result.table_plot <- reshape2::melt(result.table,
measure.vars = c("AUC", "MCC", "FDR", "TPR"),
Expand All @@ -95,7 +98,9 @@ pretty_names <- c("DESeq2" = "DESeq2",
"limma_trend_cor" = "limma trend (cor)",
"limma_trend" = "limma trend",
"phylolm_BM" = "phylolm (BM)",
"phylolm_OU" = "phylolm (OU)")
"phylolm_OU" = "phylolm (OU)",
"limma_sva_one" = "limma sva (one)",
"limma_sva_auto" = "limma sva (auto)")

pretty_names_tree <- c("no_tree" = "NB",
"us_star_tree" = "pPLN (star tree)",
Expand All @@ -120,13 +125,7 @@ df <- subset(result.table_plot,
& fact_disp == base_fact_disp
& score %in% c("FDR")
& model_process %in% c("NB", "BM")
& !(methodMod %in% c(
"limma_cor",
"phylolm_BM",
"phylolm_OU",
"limma_trend",
"limma_trend_cor"
))
& methodMod %in% c("DESeq2", "limma")
)
df$methodMod <- factor(df$methodMod, levels = c("DESeq2", "limma"))
df$tree_type <- factor(df$tree_type, levels = c("no_tree",
Expand Down Expand Up @@ -182,12 +181,12 @@ df <- subset(result.table_plot,
"limma_trend_cor"))
)
df <- subset(df, use_lengths == "with_lengths" & tree_type == "real_tree")
df$methodMod <- factor(df$methodMod, levels = c("DESeq2", "limma", "limma_cor", "phylolm_BM",
df$methodMod <- factor(df$methodMod, levels = c("DESeq2", "limma", "limma_sva_one", "limma_sva_auto", "limma_cor", "phylolm_BM",
"phylolm_OU"
))
levels(df$methodMod) <- pretty_names[levels(df$methodMod)]

if (nrow(df) / 3 / 3 / 5 != Nrep) message("wrong dimension")
if (nrow(df) / 3 / 3 / 7 != Nrep) message("wrong dimension")
if (anyNA(df$score_value)) message("Some NAs in the data")

dodge <- position_dodge(width = 0)
Expand Down Expand Up @@ -227,11 +226,10 @@ df <- subset(result.table_plot,
& effect_size == base_effect_size
& prop_var_tree == base_prop_var_tree
& fact_disp == base_fact_disp
& cond_type == base_cond_type
& score %in% c("MCC")
& !(methodMod %in% c(
"limma", "DESeq2",
"limma_trend", "limma_trend_cor"))
# & cond_type == base_cond_type
& score %in% plot_scores
& methodMod %in% c(
"phylolm_BM", "phylolm_OU", "limma_cor")
)
df <- subset(df, use_lengths == "with_lengths" & tree_type == "real_tree")
df$methodMod <- factor(df$methodMod,
Expand All @@ -243,11 +241,11 @@ df$methodMod <- factor(df$methodMod,

levels(df$methodMod) <- pretty_names[levels(df$methodMod)]

if (nrow(df) / 1 / 3 / 2 != Nrep) message("wrong dimension")
if (nrow(df) / 3 / 3 / 3 / 2 != Nrep) message("wrong dimension")
if (anyNA(df$score_value)) message("Some NAs in the data")

ggplot(df, aes(x = methodMod, y = score_value, fill = model_process, color = model_process)) +
facet_grid(score ~ ., scales = "free_y", switch = "y", labeller = labeller(Design = label_both)) +
facet_grid(score ~ cond_type, scales = "free_y", switch = "y", labeller = labeller(Design = label_both)) +
geom_boxplot(alpha = 0.3, outlier.size = 0.5, show.legend = FALSE,
position = "identity", width = 0.2*3/5,
aes(group = interaction(effect_size, methodMod, Design, model_process, factor(fact_disp), prop_var_tree))) +
Expand Down Expand Up @@ -285,9 +283,8 @@ df <- subset(result.table_plot,
& cond_type == base_cond_type
& score %in% c("MCC")
& model_process == "BM"
& !(methodMod %in% c(
"limma", "DESeq2",
"limma_trend", "limma_trend_cor"))
& methodMod %in% c("limma_cor",
"phylolm_BM", "phylolm_OU")
)
df <- subset(df, use_lengths == "with_lengths" & tree_type == "real_tree")
df$methodMod <- factor(df$methodMod,
Expand Down Expand Up @@ -597,3 +594,53 @@ pb <- ggplot(aa) +
aligned_plots <- align_plots(pp, pb, align="hv", axis="tlr")
ggdraw(aligned_plots[[1]]) + draw_plot(aligned_plots[[2]])
# colorblindr::cvd_grid(ggdraw(aligned_plots[[1]]) + draw_plot(aligned_plots[[2]]))

###############################################################################
## Figure : SVA nsv
################################################################################
df <- subset(result.table,
lengthNormalization %in% c("TPM", "length")
& transformation %in% c("log2", "none")
& effect_size == base_effect_size
& fact_disp == base_fact_disp
& prop_var_tree == base_prop_var_tree
& model_process == "BM"
# & score %in% c(plot_scores, "nsv")
& methodMod %in% c(
"limma_sva_auto")
)
df <- subset(df, use_lengths == "with_lengths" & tree_type == "real_tree")
df$methodMod <- factor(df$methodMod, levels = c("DESeq2", "limma", "limma_cor", "limma_sva_one", "limma_sva_auto", "phylolm_BM",
"phylolm_OU"
))
levels(df$methodMod) <- pretty_names[levels(df$methodMod)]

if (nrow(df) / 3 / 1 / 1 != Nrep) message("wrong dimension")
if (anyNA(df$score_value)) message("Some NAs in the data")

dodge <- position_dodge(width = 0)

ggplot(df, aes(x = Design, y = nsv, fill = Design, color = Design)) +
# geom_hline(aes(yintercept = threshold), linetype = 2) +
# facet_grid(nsv ~ ., scales = "free", switch = "y", labeller = labeller(Design = label_both)) +
geom_boxplot(alpha = 0.3, outlier.size = 0.5, show.legend = FALSE,
position = "identity", width = 0.2,
aes(group = interaction(effect_size, use_lengths, tree_type, methodMod, prop_var_tree, Design))) +
stat_summary(fun = median, geom = 'line', aes(group = interaction(effect_size, use_lengths, tree_type, prop_var_tree), colour = Design)) +
scale_fill_viridis_d(guide = "none", end = 0.8, option = "B") +
scale_colour_viridis_d(end = 0.8, option = "B", guide ="none") +
ylab("Number of Surogate Variables") +
# xlab("") +
theme_bw() +
theme(text = element_text(size = 9),
title = element_text(size = 7),
# strip.text.y = element_text(vjust = -3),
# strip.placement = "outside",
# strip.background = element_blank(),
# panel.grid.minor = element_blank(),
legend.position = c(0.01, 0.01),
legend.justification = c("left", "bottom"),
legend.key.size = unit(10, 'pt'),
# plot.margin = unit(c(0.1,0.1,-0.31,-0.5), "cm")
)
# colorblindr::cvd_grid
16 changes: 16 additions & 0 deletions R_scripts/07_data_analysis.R
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,22 @@ diff_002
match_002 <- nrow(na.omit(limma_stern_002)) - nrow(na.omit(limma_stern))
match_002

##################################################################
## RPKM
##################################################################
## Limma Cor RPKM
th <- 0.05
results_rpkm_cor <- readRDS(here(results_directory, "stern2018_cpd_lengthNorm.limma.RPKM.log2.with_trend.with_blocks.rds"))
results_rpkm_cor <- subset(results_rpkm_cor@result.table, adjpvalue <= th)
results_rpkm_cor$Orthogroup <- rownames(results_rpkm_cor)
results_rpkm_cor$Uniprot.top.hit <- all_ogs[match(results_rpkm_cor$Orthogroup, all_ogs$Orthogroup), "Uniprot.top.hit"]
results_rpkm_cor
# Match Stern - Limma Cor
rpkm_stern <- results_stern[match(results_rpkm_cor$Orthogroup, results_stern$Orthogroup), -2]
rpkm_stern
limma_rpkm <- results_limma_cor[match(results_rpkm_cor$Orthogroup, results_limma_cor$Orthogroup), -2]
limma_rpkm

# BEGIN DO NOT RUN
####################################################################
## Limma by clades - Analysis
Expand Down

0 comments on commit cdbb045

Please sign in to comment.