Update simulations with SVA analysis.

i2bc · Nov 9, 2022 · cdbb045 · cdbb045
1 parent ccf5c82
commit cdbb045
Show file tree

Hide file tree

Showing 6 changed files with 179 additions and 28 deletions.
diff --git a/2022-11-02_2021-12-01_simulations_stern2018_results/full_result_table_1_50.rds b/2022-11-02_2021-12-01_simulations_stern2018_results/full_result_table_1_50.rds
diff --git a/README.md b/README.md
@@ -15,6 +15,7 @@ Please see the header of each file for more a mode detailed description of each
   * `04_simulations_data_analysis_results.R`: comparisons of the results from various methods on the same simulated datasets.
   * `05_plot_results.R`: plot of the results as presented in the manuscript.
   * `06_simulations_data_check.R`: check of the data using `countsimQC`.
+  * `07_data_analysis.R`: re-analysis of the Crayfish dataset.
 
 * `data`: data issued from Stern et al. 2018.
   * `crayfish.nodelabels.tre`: crayfish tree from Stern et al 2018.
@@ -36,6 +37,8 @@ Please see the header of each file for more a mode detailed description of each
   * `full_result_table_1_50.rds`: scores of the various differential expression analysis methods applied on the various simulated datasets.
     Produced by the `R` script `R_scripts/04_simulations_data_analysis_results.R`.
 
+* `2022-11-02_2021-12-01_simulations_stern2018_results`: results of a more recent run of the analyses, that includes SVA tests.
+
 ## Requirements
 
 * [`R`](https://cran.r-project.org/index.html). Version 4.0 or later.

diff --git a/R_scripts/03_simulations_data_analysis.R b/R_scripts/03_simulations_data_analysis.R
@@ -35,7 +35,7 @@ library(doParallel)
 here_dir <- here()
 
 name_data <- "stern2018"
-datestamp_day_simus <- "2021-11-24" ## Change here to match simulation date
+datestamp_day_simus <- "2021-12-01" ## Change here to match simulation date
 simus_directory <- paste0(datestamp_day_simus, "_simulations_", name_data)
 simus_directory <- file.path(here_dir, simus_directory)
 load(file.path(simus_directory, "simulation_parameters.RData"))
@@ -54,11 +54,14 @@ dir.create(results_directory)
 ################################################################################
 ## Iterations to analyze
 Nmin <- 1
-Nmax <- 20
+Nmax <- 50
 
 ## Required packages to pass to all nodes
 reqpckg <- c("compcodeR", "here", "foreach")
 
+## Keep only some prop var tree
+all_prop_var_tree <- list(0.6, 0.8, 1.0)
+
 ## Number of cores
 Ncores <- 3
 
@@ -157,7 +160,7 @@ foreach (tree_type = all_tree_types) %:%
         ## Remove some combinations of parameters
         if(cond_type != "sights" && (lnorm != "TPM" || ltrans != "log2")) return(NULL)
         if(block != "with_blocks" && (lnorm != "TPM" || ltrans != "log2")) return(NULL)
-        if(model_process == "OU" && (lnorm != "TPM" || ltrans != "log2" || cond_type != "sights" || block != "with_blocks")) return(NULL)
+        if(model_process == "OU" && (lnorm != "TPM" || ltrans != "log2" || block != "with_blocks")) return(NULL)
         if(trend != "no_trend" && (lnorm != "TPM" || ltrans != "log2" || cond_type != "sights")) return(NULL)
 
         ## Run analysis
@@ -199,8 +202,8 @@ foreach (tree_type = all_tree_types) %:%
 
           ## Remove some combinations of parameters
           if(cond_type != "sights" && (lnorm != "TPM" || ltrans != "log2")) return(NULL)
-          if(model_process == "OU" && (lnorm != "TPM" || ltrans != "log2" || cond_type != "sights")) return(NULL)
-          
+          if(model_process == "OU" && (lnorm != "TPM" || ltrans != "log2")) return(NULL)
+
           ## Run analysis
           runDiffExp(data.file = paste0(data_file, "_", i, ".rds"),
                      result.extent = method_name,
@@ -212,6 +215,54 @@ foreach (tree_type = all_tree_types) %:%
                      extra.design.covariates = NULL,
                      length.normalization = lnorm,
                      data.transformation = ltrans)
+
+          ## For the first iteration only, generate the analysis code
+          if (i == 1) generateCodeHTMLs(file.path(results_directory, paste0(dataset, "_", i, "_", method_name, ".rds")), results_directory)
+        }
+
+      ####################################################################
+      ## SVA + limma
+      ####################################################################
+      method <- "lengthNorm.sva.limma"
+
+      ## Length normalization when there are lengths only
+      all_length_norm <- c("none")
+      if (use_lengths == "with_lengths") all_length_norm <- c("none", "RPKM", "TPM")
+      ## Replicates correlation when there are replicates only
+      all_n.sv <- c("auto", "one")
+      if (!tree_type %in% c("no_tree", "us_star_tree")) all_blocks <- c("no_blocks", "with_blocks")
+
+      foreach (lnorm =  all_length_norm) %:%
+        foreach (ltrans = c("log2", "sqrt")) %:%
+        foreach (trend = c("no_trend", "with_trend")) %:%
+        foreach (n.sv = all_n.sv) %:%
+        foreach(i = Nmin:Nmax, .packages = reqpckg, .verbose = TRUE) %do% {
+
+          ## Method id
+          method_name <- paste(method, lnorm, ltrans, trend, n.sv, sep = ".")
+
+          ## If replicate correlation, find the right blocks
+          # is_block <- NULL
+          # if (block == "with_blocks") is_block <- "id.species"
+
+          ## Sanity check : if result file already exists, do not run the analysis
+          if (file.exists(file.path(results_directory, paste0(dataset, "_", i, "_", method_name, ".rds")))) return(NULL)
+
+          ## Remove some combinations of parameters
+          if(cond_type != "sights" && (lnorm != "TPM" || ltrans != "log2")) return(NULL)
+          if(model_process == "OU" && (lnorm != "TPM" || ltrans != "log2")) return(NULL)
+          if(trend != "no_trend") return(NULL)
+
+          ## Run analysis
+          runDiffExp(data.file = paste0(data_file, "_", i, ".rds"),
+                     result.extent = method_name,
+                     Rmdfunction = paste0(method, ".createRmd"),
+                     output.directory = results_directory,
+                     norm.method = "TMM",
+                     length.normalization = lnorm,
+                     data.transformation = ltrans,
+                     trend = (trend == "with_trend"),
+                     n.sv = ifelse(n.sv == "auto", "auto", 1))
 
           ## For the first iteration only, generate the analysis code
           if (i == 1) generateCodeHTMLs(file.path(results_directory, paste0(dataset, "_", i, "_", method_name, ".rds")), results_directory)

diff --git a/R_scripts/04_simulations_data_analysis_results.R b/R_scripts/04_simulations_data_analysis_results.R
@@ -32,8 +32,8 @@ library(compcodeR)
 # here_dir <- sub("save", "work", here())
 here_dir <- here()
 
-datestamp_day_simus <- "2021-11-25" ## Change here for the simulation date
-datestamp_day_anaysis <- "2021-11-25" ## Change here for the analysis date
+datestamp_day_simus <- "2021-12-01" ## Change here for the simulation date
+datestamp_day_anaysis <- "2022-11-02" ## Change here for the analysis date
 name_data <- "stern2018"
 simus_directory <- paste0(datestamp_day_simus, "_simulations_", name_data)
 simus_directory <- file.path(here_dir, simus_directory)
@@ -215,6 +215,40 @@ for (dataset in all_datasets) {
     res.table$methodMod[grepl(nn, res.table$de.methods)] <- paste0(res.table$methodMod[grepl(nn, res.table$de.methods)], "_cor")
   }
   res.table$blockCor[is.na(res.table$blockCor)] <- "none"
+  # sva
+  res.table$sva <- grepl("sva", res.table$de.methods)
+  # nsv
+  get_nsv <- function(res_sub) {
+    n_files <- nrow(res_sub)
+    all_nsv <- rep(NA, n_files)
+    for (i in seq_len(n_files)) {
+      bool <- grepl("sva", all_method_files) & 
+        grepl(res_sub[i, "transformation"], all_method_files) &
+        grepl(res_sub[i, "trendeBayes"], all_method_files) &
+        grepl(res_sub[i, "use_lengths"], all_method_files) &
+        grepl(res_sub[i, "lengthNormalization"], all_method_files) &
+        grepl(paste0("\\.", res_sub[i, "nsvmethod"], "\\."), all_method_files) &
+        grepl(paste0("_", res_sub[i, "repl"], "_lengthNorm"), all_method_files)
+      tmp <- readRDS(all_method_files[bool])
+      if (length(tmp@result.table$n.sv) == 0) {
+        print(res_sub[i, "de.methods"])
+        print(all_method_files[bool])
+        all_nsv[i] <- NA
+      } else {
+        all_nsv[i] <- tmp@result.table$n.sv[1]
+      }
+    }
+    return(all_nsv)
+  }
+  res.table$nsvmethod <- NA
+  for (nn in c("n\\.sv\\.auto", "n\\.sv\\.1")) {
+    res.table$nsvmethod[grepl(nn, res.table$de.methods)] <- ifelse(nn == "n\\.sv\\.auto", "auto", "one")
+    res.table$methodMod[grepl(nn, res.table$de.methods)] <- paste0(res.table$methodMod[grepl(nn, res.table$de.methods)], "_sva_", ifelse(nn == "n\\.sv\\.auto", "auto", "one"))
+  }
+  res.table$nsv <- 0
+  for (nn in c("n\\.sv\\.auto", "n\\.sv\\.1")) {
+    res.table$nsv[grepl(nn, res.table$de.methods)] <- get_nsv(res.table[grepl(nn, res.table$de.methods), ])
+  } 
 
   ## Effective Sample Size
   oneres <- readRDS(file.table$input.files[1])

diff --git a/R_scripts/05_plot_results.R b/R_scripts/05_plot_results.R
@@ -30,7 +30,7 @@ library(here)
 ## File management
 ################################################################################
 datestamp_day_simus <- "2021-12-01" ## Change here for the simulation date
-datestamp_day_anaysis <- "2021-12-01" ## Change here for the analysis date
+datestamp_day_anaysis <- "2022-11-02" ## Change here for the analysis date
 name_data <- "stern2018"
 simus_directory <- paste0(datestamp_day_simus, "_simulations_", name_data)
 simus_directory <- here(simus_directory)
@@ -78,6 +78,9 @@ result.table$Design <- factor(result.table$Design, levels = c("block", "sight",
 result.table$prop_var_tree <- as.factor(result.table$prop_var_tree)
 result.table$effect_size <- as.factor(result.table$effect_size)
 
+result.table$methodMod[grepl("n.sv.auto", result.table$de.methods)] <- "limma_sva_auto"
+result.table$nsvmethod[grepl("n.sv.auto", result.table$de.methods)] <- "auto"
+
 ## Format for plot
 result.table_plot <- reshape2::melt(result.table,
                                     measure.vars = c("AUC", "MCC", "FDR", "TPR"),
@@ -95,7 +98,9 @@ pretty_names <- c("DESeq2" = "DESeq2",
                   "limma_trend_cor" = "limma trend (cor)",
                   "limma_trend" = "limma trend",
                   "phylolm_BM" = "phylolm (BM)",
-                  "phylolm_OU" = "phylolm (OU)")
+                  "phylolm_OU" = "phylolm (OU)",
+                  "limma_sva_one" = "limma sva (one)",
+                  "limma_sva_auto" = "limma sva (auto)")
 
 pretty_names_tree <- c("no_tree" = "NB",
                        "us_star_tree" = "pPLN (star tree)",
@@ -120,13 +125,7 @@ df <- subset(result.table_plot,
              & fact_disp == base_fact_disp
              & score %in% c("FDR")
              & model_process %in% c("NB", "BM")
-             & !(methodMod %in% c(
-               "limma_cor",
-               "phylolm_BM",
-               "phylolm_OU",
-               "limma_trend",
-               "limma_trend_cor"
-             ))
+             & methodMod %in% c("DESeq2", "limma")
 )
 df$methodMod <- factor(df$methodMod, levels = c("DESeq2", "limma"))
 df$tree_type <- factor(df$tree_type, levels = c("no_tree",
@@ -182,12 +181,12 @@ df <- subset(result.table_plot,
                "limma_trend_cor"))
 )
 df <- subset(df, use_lengths ==  "with_lengths" & tree_type == "real_tree")
-df$methodMod <- factor(df$methodMod, levels = c("DESeq2", "limma", "limma_cor", "phylolm_BM",
+df$methodMod <- factor(df$methodMod, levels = c("DESeq2", "limma", "limma_sva_one", "limma_sva_auto", "limma_cor", "phylolm_BM",
                                                 "phylolm_OU"
 ))
 levels(df$methodMod) <- pretty_names[levels(df$methodMod)]
 
-if (nrow(df) / 3 / 3 / 5 != Nrep) message("wrong dimension")
+if (nrow(df) / 3 / 3 / 7 != Nrep) message("wrong dimension")
 if (anyNA(df$score_value)) message("Some NAs in the data")
 
 dodge <- position_dodge(width = 0)
@@ -227,11 +226,10 @@ df <- subset(result.table_plot,
              & effect_size == base_effect_size
              & prop_var_tree == base_prop_var_tree
              & fact_disp == base_fact_disp
-             & cond_type == base_cond_type
-             & score %in% c("MCC")
-             & !(methodMod %in% c(
-               "limma", "DESeq2",
-               "limma_trend", "limma_trend_cor"))
+             # & cond_type == base_cond_type
+             & score %in% plot_scores
+             & methodMod %in% c(
+               "phylolm_BM", "phylolm_OU", "limma_cor")
 )
 df <- subset(df, use_lengths ==  "with_lengths" & tree_type == "real_tree")
 df$methodMod <- factor(df$methodMod,
@@ -243,11 +241,11 @@ df$methodMod <- factor(df$methodMod,
 
 levels(df$methodMod) <- pretty_names[levels(df$methodMod)]
 
-if (nrow(df) / 1 / 3 / 2 != Nrep) message("wrong dimension")
+if (nrow(df) / 3 / 3 / 3 / 2 != Nrep) message("wrong dimension")
 if (anyNA(df$score_value)) message("Some NAs in the data")
 
 ggplot(df, aes(x = methodMod, y = score_value, fill = model_process, color = model_process)) +
-  facet_grid(score ~ ., scales = "free_y", switch = "y", labeller = labeller(Design = label_both)) +
+  facet_grid(score ~ cond_type, scales = "free_y", switch = "y", labeller = labeller(Design = label_both)) +
   geom_boxplot(alpha = 0.3, outlier.size = 0.5, show.legend = FALSE,
                position = "identity", width = 0.2*3/5,
                aes(group = interaction(effect_size, methodMod, Design, model_process, factor(fact_disp), prop_var_tree))) +
@@ -285,9 +283,8 @@ df <- subset(result.table_plot,
              & cond_type == base_cond_type
              & score %in% c("MCC")
              & model_process == "BM"
-             & !(methodMod %in% c(
-               "limma", "DESeq2",
-               "limma_trend", "limma_trend_cor"))
+             & methodMod %in% c("limma_cor",
+                                "phylolm_BM", "phylolm_OU")
 )
 df <- subset(df, use_lengths ==  "with_lengths" & tree_type == "real_tree")
 df$methodMod <- factor(df$methodMod,
@@ -597,3 +594,53 @@ pb <- ggplot(aa) +
 aligned_plots <- align_plots(pp, pb, align="hv", axis="tlr")
 ggdraw(aligned_plots[[1]]) + draw_plot(aligned_plots[[2]])
 # colorblindr::cvd_grid(ggdraw(aligned_plots[[1]]) + draw_plot(aligned_plots[[2]]))
+
+###############################################################################
+## Figure : SVA nsv
+################################################################################
+df <- subset(result.table,
+             lengthNormalization %in% c("TPM", "length")
+             & transformation %in% c("log2", "none")
+             & effect_size == base_effect_size
+             & fact_disp == base_fact_disp
+             & prop_var_tree == base_prop_var_tree
+             & model_process == "BM"
+             # & score %in% c(plot_scores, "nsv")
+             & methodMod %in% c(
+               "limma_sva_auto")
+)
+df <- subset(df, use_lengths ==  "with_lengths" & tree_type == "real_tree")
+df$methodMod <- factor(df$methodMod, levels = c("DESeq2", "limma", "limma_cor", "limma_sva_one", "limma_sva_auto", "phylolm_BM",
+                                                "phylolm_OU"
+))
+levels(df$methodMod) <- pretty_names[levels(df$methodMod)]
+
+if (nrow(df) / 3 / 1 / 1 != Nrep) message("wrong dimension")
+if (anyNA(df$score_value)) message("Some NAs in the data")
+
+dodge <- position_dodge(width = 0)
+
+ggplot(df, aes(x = Design, y = nsv, fill = Design, color = Design)) +
+  # geom_hline(aes(yintercept = threshold), linetype = 2) +
+  # facet_grid(nsv ~ ., scales = "free", switch = "y", labeller = labeller(Design = label_both)) +
+  geom_boxplot(alpha = 0.3, outlier.size = 0.5, show.legend = FALSE,
+               position = "identity", width = 0.2,
+               aes(group = interaction(effect_size, use_lengths, tree_type, methodMod, prop_var_tree, Design))) +
+  stat_summary(fun = median, geom = 'line', aes(group = interaction(effect_size, use_lengths, tree_type, prop_var_tree), colour = Design)) +
+  scale_fill_viridis_d(guide = "none", end = 0.8, option = "B") +
+  scale_colour_viridis_d(end = 0.8, option = "B", guide ="none") +
+  ylab("Number of Surogate Variables") +
+  # xlab("") +
+  theme_bw() +
+  theme(text = element_text(size = 9),
+        title = element_text(size = 7),
+        # strip.text.y = element_text(vjust = -3),
+        # strip.placement = "outside",
+        # strip.background = element_blank(),
+        # panel.grid.minor = element_blank(),
+        legend.position = c(0.01, 0.01),
+        legend.justification = c("left", "bottom"),
+        legend.key.size = unit(10, 'pt'),
+        # plot.margin = unit(c(0.1,0.1,-0.31,-0.5), "cm")
+  )
+# colorblindr::cvd_grid
diff --git a/R_scripts/07_data_analysis.R b/R_scripts/07_data_analysis.R
@@ -287,6 +287,22 @@ diff_002
 match_002 <- nrow(na.omit(limma_stern_002)) - nrow(na.omit(limma_stern))
 match_002
 
+##################################################################
+## RPKM
+##################################################################
+## Limma Cor RPKM
+th <- 0.05
+results_rpkm_cor <- readRDS(here(results_directory, "stern2018_cpd_lengthNorm.limma.RPKM.log2.with_trend.with_blocks.rds"))
+results_rpkm_cor <- subset(results_rpkm_cor@result.table, adjpvalue <= th)
+results_rpkm_cor$Orthogroup <- rownames(results_rpkm_cor)
+results_rpkm_cor$Uniprot.top.hit <- all_ogs[match(results_rpkm_cor$Orthogroup, all_ogs$Orthogroup), "Uniprot.top.hit"]
+results_rpkm_cor
+# Match Stern - Limma Cor
+rpkm_stern <- results_stern[match(results_rpkm_cor$Orthogroup, results_stern$Orthogroup), -2]
+rpkm_stern
+limma_rpkm <- results_limma_cor[match(results_rpkm_cor$Orthogroup, results_limma_cor$Orthogroup), -2]
+limma_rpkm
+
 # BEGIN DO NOT RUN 
 ####################################################################
 ## Limma by clades - Analysis