--- title: "Endoderm_TC_Technical_Factor_Analysis" author: "Lauren Blake" date: "January 12, 2017" output: html_document --- The goal of this is to establish which, if any, technical factors are correlated with our biological variables of interest. # PART ONE: See if any of the variables for RNA-Seq correlate with the expression PCs for genes (63 samples) ## Initialization ```{r} # Load libraries library("gdata") library("ggplot2") library("qvalue") library("glmnet") source("~/Desktop/Endoderm_TC/ashlar-trial/analysis/chunk-options.R") # Load cpm data cpm_in_cutoff <- read.delim("../data/cpm_cyclicloess.txt") # Load sample information After_removal_sample_info <- read.csv("~/Desktop/Endoderm_TC/After_removal_sample_info.csv") Species <- After_removal_sample_info$Species species <- After_removal_sample_info$Species day <- After_removal_sample_info$Day individual <- After_removal_sample_info$Individual Sample_ID <- After_removal_sample_info$Sample_ID labels <- paste(Sample_ID, day, sep=" ") # Load technical factor information RNA_seq_info_all <- read.csv("../data/Endo_TC_Data_Share_Sorting.csv", header = T) dim(RNA_seq_info_all) RNA_seq_info <- as.data.frame(cbind(RNA_seq_info_all[1:63, 4], RNA_seq_info_all[1:63, 3], RNA_seq_info_all[1:63, 5:27], RNA_seq_info_all[1:63, 30:35], RNA_seq_info_all[1:63, 37:43])) # Remove library well (only 1/well) RNA_seq_info <- RNA_seq_info[,-20] # Full data set dim(RNA_seq_info) ``` ## Obtain gene expression PCs ```{r} # PCs pca_genes <- prcomp(t(cpm_in_cutoff), scale = T, retx = TRUE, center = TRUE) matrixpca <- pca_genes$x pc1 <- matrixpca[,1] pc2 <- matrixpca[,2] pc3 <- matrixpca[,3] pc4 <- matrixpca[,4] pc5 <- matrixpca[,5] pcs <- data.frame(pc1, pc2, pc3, pc4, pc5) summary <- summary(pca_genes) ``` ## Plots for PCs versus technical factors ```{r} #Create plots for each of the possible confounders versus PCs 1-5 pdf('../data/VarVsGenePCs.pdf') for (i in 2:length(RNA_seq_info)) { par(mfrow=c(1,5)) plot(RNA_seq_info[,i], pcs[,1], ylab = "PC1", xlab = " ") plot(RNA_seq_info[,i], pcs[,2], ylab = "PC2", xlab = " ") plot(RNA_seq_info[,i], pcs[,3], ylab = "PC3", xlab = " ") plot(RNA_seq_info[,i], pcs[,4], ylab = "PC4", xlab = " ") plot(RNA_seq_info[,i], pcs[,5], ylab = "PC5", xlab = " ") title(xlab = substitute(paste(k), list(k=colnames(RNA_seq_info)[i])), outer = TRUE, line = -2) mtext(substitute(paste('PCs vs. ', k), list(k=colnames(RNA_seq_info)[i])), side = 3, line = -2, outer = TRUE) } dev.off() ``` ## Testing association between a particular variable and PCs with a linear model ```{r} # TESTING BIOLOGICAL VARIABLES OF INTEREST PC_pvalues_day = matrix(data = NA, nrow = 5, ncol = 1, dimnames = list(c("PC1", "PC2", "PC3", "PC4", "PC5"), c("Day"))) for(i in 1:5){ # PC versus day checkPC1 <- lm(pcs[,i] ~ as.factor(day)) #Get the summary statistics from it summary(checkPC1) #Get the p-value of the F-statistic summary(checkPC1)$fstatistic fstat <- as.data.frame(summary(checkPC1)$fstatistic) p_fstat <- 1-pf(fstat[1,], fstat[2,], fstat[3,]) PC_pvalues_day[i,1] <- p_fstat #Fraction of the variance explained by the model r2_value <- summary(checkPC1)$r.squared } # PC versus species PC_pvalues_species = matrix(data = NA, nrow = 5, ncol = 1, dimnames = list(c("PC1", "PC2", "PC3", "PC4", "PC5"), c("Species"))) for(i in 1:5){ # PC versus species checkPC1 <- lm(pcs[,i] ~ as.factor(species)) #Get the summary statistics from it summary(checkPC1) #Get the p-value of the F-statistic summary(checkPC1)$fstatistic fstat <- as.data.frame(summary(checkPC1)$fstatistic) p_fstat <- 1-pf(fstat[1,], fstat[2,], fstat[3,]) PC_pvalues_species [i,1] <- p_fstat #Fraction of the variance explained by the model r2_value <- summary(checkPC1)$r.squared } # TESTING TECHNICAL VARIABLES OF INTEREST #Make an empty matrix to put all of the data in # Note: Do not include TC day, as it is a biological variable of interest PC_pvalues = matrix(data = NA, nrow = 5, ncol = 35, dimnames = list(c("PC1", "PC2", "PC3", "PC4", "PC5"), c("Cell line", "Batch", "Sex", "Passage at seed", "Start date", "Density at seed", "Harvest time", "Harvest density", "Purity", "Max purity", "RNA Extraction Date", "RNA conc", "RIN", "260 280 RNA", "260 230 RNA", "DNA concentration", "Library prep batch", "Library concentration", "uL sample", "uL EB", "Index sequence", "Seq pool", "Lane r1", "Mseqs R1", "Total lane reads 1", "Lane prop r1", "Dup r1", "GC r1", "Lane r2", "Mseqs r2", "Total lane reads r2", "Lane prop r2", "Dups r2", "GC r2", "Total reads"))) PC_r2 = matrix(data = NA, nrow = 5, ncol = 35, dimnames = list(c("PC1", "PC2", "PC3", "PC4", "PC5"), c("Cell line", "Batch", "Sex", "Passage at seed", "Start date", "Density at seed", "Harvest time", "Harvest density", "Purity", "Max purity", "RNA Extraction Date", "RNA conc", "RIN", "260 280 RNA", "260 230 RNA", "DNA concentration", "Library prep batch", "Library concentration", "uL sample", "uL EB", "Index sequence", "Seq pool", "Lane r1", "Mseqs R1", "Total lane reads 1", "Lane prop r1", "Dup r1", "GC r1", "Lane r2", "Mseqs r2", "Total lane reads r2", "Lane prop r2", "Dups r2", "GC r2", "Total reads"))) # Take out day and species (factors of interest) RNA_seq_info_35 <- RNA_seq_info[,-(1:2)] numerical_tech_factors <- c(4, 6,8:10, 12:16,18:20,24:28,30:35) categorical_tech_factors <- c(1:3, 5,7,11,17,21:23,29) #Check lm to see how well the variables containing numerical data are correlated with a PC #For PCs 1-5 j=1 for (i in numerical_tech_factors){ for (j in 1:5){ checkPC1 <- lm(pcs[,j] ~ RNA_seq_info_35[,i]) #Get the summary statistics from it summary(checkPC1) #Get the p-value of the F-statistic summary(checkPC1)$fstatistic fstat <- as.data.frame(summary(checkPC1)$fstatistic) p_fstat <- 1-pf(fstat[1,], fstat[2,], fstat[3,]) #Fraction of the variance explained by the model r2_value <- summary(checkPC1)$r.squared #Put the summary statistics into the matrix w PC_pvalues[j, i] <- p_fstat PC_r2[j, i] <- r2_value } } #Check lm to see how well the variables containing ordinal data are correlated with a PC #For PCs 1-5 j=1 for (i in categorical_tech_factors){ for (j in 1:5){ checkPC1 <- lm(pcs[,j] ~ as.factor(RNA_seq_info_35[,i])) #Get the summary statistics from it summary(checkPC1) #Get the p-value of the F-statistic summary(checkPC1)$fstatistic fstat <- as.data.frame(summary(checkPC1)$fstatistic) p_fstat <- 1-pf(fstat[1,], fstat[2,], fstat[3,]) #Fraction of the variance explained by the model r2_value <- summary(checkPC1)$r.squared #Put the summary statistics into the matrix w PC_pvalues[j, i] <- p_fstat PC_r2[j, i] <- r2_value } } #write.table(PC_pvalues, "/Users/laurenblake/Desktop/pc_pvalues.txt") ``` ## Test for potential violations of the assumptions of the linear model Note: I learned in http://lauren-blake.github.io/Reg_Evo_Primates/analysis/Tech_factor_analysis1_gene_exp.html that this doesn't work when one or more cells in a column contains an "NA". ## Find which factors are statistically significant ```{r} #Distribution of p-values adjusted by FDR not including species or tissue fdr_val = p.adjust(PC_pvalues, method = "fdr", n = length(PC_pvalues)) fdr_val_order = fdr_val[order(fdr_val)] hist(fdr_val_order, ylab = "BH-adjusted p-values", main = "Distribution of Benjamini and Hochberg adjusted p-values", breaks = 10) # Number of values significant at 10% FDR fdr_val <- matrix(fdr_val, nrow = 5, ncol = 35) matrix_fdr_val = matrix(data = fdr_val, nrow = 5, ncol = 35, dimnames = list(c("PC1", "PC2", "PC3", "PC4", "PC5"), c("Cell line", "Batch", "Sex", "Passage at seed", "Start date", "Density at seed", "Harvest time", "Harvest density", "High conf purity", "Max purity", "RNA Extraction Date", "RNA conc", "RIN", "260 280 RNA", "260 230 RNA", "DNA concentration", "Library prep batch", "Library concentration", "uL sample", "uL EB", "Index sequence", "Seq pool", "Lane r1", "Mseqs R1", "Total lane reads 1", "Lane prop r1", "Dup r1", "GC r1", "Lane r2", "Mseqs r2", "Total lane reads r2", "Lane prop r2", "Dups r2", "GC r2", "Total reads"))) #write.table(matrix_fdr_val, "/Users/laurenblake/Desktop/pc_pvalues.txt") # Number of values significant at 10% FDR sum(matrix_fdr_val <= 0.1) #Get the coordinates of which variables/PC combinations are significant at FDR 10% TorF_matrix_fdr <- matrix_fdr_val <=0.1 coor_to_check <- which(matrix_fdr_val <= 0.1, arr.ind=T) coor_to_check <- as.data.frame(coor_to_check) # Number of variables significant at 10% FDR (note: off by 1 from column number in RNA_seq_info file; see notes in Part Two) coor_to_check_col <- coor_to_check$col unique(coor_to_check_col) ``` ** Conclusions from Part I** The following variables are associated with one of the PCs tested and will be investigated further in Part 2. * Cell line * Batch * Passage at seed * Start date * Harvest time * Harvest density * Purity * Max purity * RNA Extraction Date * RNA conc * RIN * 260 280 RNA * Library prep batch * Index sequence * Seq pool * Lane r1 * Total lane reads 1 * Lane r2 # PART TWO: For the variable(s) that correlate, see if these segregate with either species or tissue In coor_to_check_col, row is the PC # and col is the column # -1 that is associated with the PC. Want to take the coor_to_check_col column # and add one. ```{r} var_numb = unique(coor_to_check_col) var.numb <- as.data.frame(var_numb) for (i in var.numb[1:nrow(var.numb),]) { par(mfrow=c(1,2)) plot(day, RNA_seq_info[,i], xlab = "Species", ylab = substitute(paste(k), list(k=colnames(RNA_seq_info)[i]))) plot(Species, RNA_seq_info[,i], xlab = "Tissue", ylab = substitute(paste(k), list(k=colnames(RNA_seq_info)[i]))) mtext(substitute(paste(k, ' vs. Species and Tissue'), list(k=colnames(RNA_seq_info)[i])), side = 3, line = -2, outer = TRUE) } dev.off() ``` ## Testing to see if differences across variable groups are statistically significant for day. ```{r} # Technical factors to be tested numerical_prev_sign_tech_factors <- c(4, 8:10, 12:14,25) categorical_prev_sign_tech_factors <- c(1,2,5,7,11, 17,21:23,29) #Make a matrix to store the p-values pvalues_day = matrix(data = NA, nrow = 1, ncol = 35, dimnames = list(c("p-value"), c("Cell line", "SPSX", "Batch", "Passage at seed", "Start date", "Density at seed", "Harvest time", "Harvest density", "High conf purity", "Max purity", "RNA Extraction Date", "RNA conc", "RIN", "260 280 RNA", "260 230 RNA", "DNA concentration", "Library prep batch", "Library concentration", "uL sample", "uL EB", "Index sequence", "Seq pool", "Lane r1", "Mseqs R1", "Total lane reads 1", "Lane prop r1", "Dup r1", "GC r1", "Lane r2", "Mseqs r2", "Total lane reads r2", "Lane prop r2", "Dups r2", "GC r2", "Total reads"))) # Numerical #Performing this test of significance for variables that are numerical data (Using an ANOVA) j=1 for (i in numerical_prev_sign_tech_factors) { summary_anova = summary(aov(RNA_seq_info_35[,i]~ as.factor(day))) data_summary_anova <- as.data.frame(summary_anova[[1]]$`Pr(>F)`) p_val_anova <- data_summary_anova[1,] pvalues_day[, i] <- p_val_anova j=j+1 } # Ordinal #Performing this test of significance for variables that are categorical data (Using Pearson's chi-squared test) j=1 for (i in categorical_prev_sign_tech_factors) { pval_chi <- chisq.test(as.factor(RNA_seq_info_35[,i]), as.factor(day), simulate.p.value = TRUE)$p.value pvalues_day[, i] <- pval_chi j=j+1 } ``` ## Testing to see if differences across variable groups are statistically significant for species. ```{r} # Technical factors to be tested numerical_prev_sign_tech_factors <- c(4, 8:10, 12:14,25) categorical_prev_sign_tech_factors <- c(1,2,5,7,11, 17,21:23,29) #Make a matrix to store the p-values pvalues_species = matrix(data = NA, nrow = 1, ncol = 35, dimnames = list(c("p-value"), c("Cell line", "Batch", "Sex", "Passage at seed", "Start date", "Density at seed", "Harvest time", "Harvest density", "High conf purity", "Max purity", "RNA Extraction Date", "RNA conc", "RIN", "260 280 RNA", "260 230 RNA", "DNA concentration", "Library prep batch", "Library concentration", "uL sample", "uL EB", "Index sequence", "Seq pool", "Lane r1", "Mseqs R1", "Total lane reads 1", "Lane prop r1", "Dup r1", "GC r1", "Lane r2", "Mseqs r2", "Total lane reads r2", "Lane prop r2", "Dups r2", "GC r2", "Total reads"))) # Numerical #Performing this test of significance for variables that are numerical data (Using an ANOVA. Note: in this case, the p-value of the ANOVA is the same as the p-value of the beta1 coefficient in lm) j=1 for (i in numerical_prev_sign_tech_factors) { summary_anova = summary(aov(RNA_seq_info_35[,i]~ as.factor(Species))) data_summary_anova <- as.data.frame(summary_anova[[1]]$`Pr(>F)`) p_val_anova <- data_summary_anova[1,] pvalues_species[, i] <- p_val_anova j=j+1 } # Ordinal #Performing this test of significance for variables that are categorical data (Using Pearson's chi-squared test) j=1 for (i in categorical_prev_sign_tech_factors) { pval_chi <- chisq.test(as.factor(RNA_seq_info_35[,i]), as.factor(Species), simulate.p.value = TRUE)$p.value pvalues_species[, i] <- pval_chi j=j+1 } # Combine tables collapse_table_full <- rbind(pvalues_day, pvalues_species) # Want only NAs collapse_table <- collapse_table_full[, colSums(is.na(collapse_table_full)) != nrow(collapse_table_full)] #write.table(collapse_table, "/Users/laurenblake/Desktop/collapse_table.txt") ``` ```{r} # Harvest density and Species summary(lm(RNA_seq_info$Harvest_density~ as.factor(Species))) # Harvest density and Day summary(lm(RNA_seq_info$Harvest_density~ as.factor(day))) chisq.test(as.factor(RNA_seq_info$Harvest_density), as.factor(day), simulate.p.value = TRUE) ``` ## Find factors with FDR < 0.1 ```{r} #Calculate q-values (FDR = 10%) fdr_val = p.adjust(collapse_table, method = "fdr", n = length(collapse_table)*2) fdr_val_order = fdr_val[order(fdr_val)] hist(fdr_val_order, ylab = "BH-adjusted p-values", main = "Distribution of Benjamini and Hochberg adjusted p-values", breaks = 10) collapse_table_fdr_val = matrix(data = fdr_val, nrow = 2, ncol = nrow(var.numb), dimnames = list(c("Day", "Species"), colnames(collapse_table))) collapse_table_fdr_val write.table(collapse_table_fdr_val, "/Users/laurenblake/Desktop/collapse_table_fdr.txt") ``` ** Conclusions from Part 2 ** The following variables are confounded with day: * Harvest time (BH adjusted p-value = 0.011994) * Harvest density (BH adjusted p-value = 0.02174627) * Index sequence (BH adjusted p-value = 0.011994) The following variables are confounded with species: * Cell line (BH adjusted p-value = 0.011994) * Harvest density (BH adjusted p-value = 0.03481949) * Purity (BH adjusted p-value = 0.08328609) * Max. purity (BH adjusted p-value = 0.03033897) ```{r} plot(day, as.factor(RNA_seq_info[,2]), main = "Cell line versus day") plot(Species, as.factor(RNA_seq_info[,2]), main = "Cell line versus species (sign.)") plot(day, as.factor(RNA_seq_info[,3]), main = "SPSX versus day") plot(Species, as.factor(RNA_seq_info[,3]), main = "SPSX versus species (sign.)") plot(day, as.factor(RNA_seq_info[,8]), main = "Harvest time versus day (sign.)") plot(Species, RNA_seq_info[,8], main = "Harvest time versus species") plot(as.factor(day), RNA_seq_info[,9], main = "Harvest density versus day (sign.)", ylab = "Harvest density", xlab = "Day") plot(Species, RNA_seq_info[,9], main = "Harvest density versus species (sign.)", ylab = "Harvest density", xlab = "Species") plot(as.factor(day), RNA_seq_info[,10], main = "High confidence purity versus day", ylab = "High confidence purity", xlab = "Day") plot(Species, RNA_seq_info[,10], main = "High confidence purity versus species", ylab = "High confidence purity", xlab = "Species") plot(as.factor(day), RNA_seq_info[,11], main = "Max. purity versus day", ylab = "Max. purity", xlab = "Day") plot(Species, RNA_seq_info[,11], main = "Max. purity versus species (sign.)", ylab = "Max. purity", xlab = "Species") plot(as.factor(day), as.factor(RNA_seq_info[,22]), main = "Index sequence versus day (sign.)") plot(Species, RNA_seq_info[,22], main = "Index sequence versus species") ``` ### Combine and test day versus adapter for each sequencing pool ```{r} # Make table of day, adaptor, and sequencing pool day_index_pool <- cbind(day, RNA_seq_info$Index_seq, RNA_seq_info$Seq_pool) # Test dependency with the adaptors by day # Sequencing pool 1 adaptors_pool1 <- day_index_pool[ which(day_index_pool[,3]==1),] chisq.test(as.factor(adaptors_pool1[,2]), as.factor(adaptors_pool1[,1]), simulate.p.value = TRUE)$p.value #fisher.test(adaptors_pool1[,1:2], simulate.p.value=TRUE)$p.value #fisher.test(adaptors_pool2[,2:1], simulate.p.value=TRUE)$p.value #fisher.test(adaptors_pool3[,2:1], simulate.p.value=TRUE)$p.value #fisher.test(adaptors_pool4[,2:1], simulate.p.value=TRUE)$p.value # p-value = 1 # Sequencing pool 2 adaptors_pool2 <- day_index_pool[ which(day_index_pool[,3]==2),] chisq.test(as.factor(adaptors_pool2[,2]), as.factor(adaptors_pool2[,1]), simulate.p.value = TRUE)$p.value # p-value = 1 # Sequencing pool 3 adaptors_pool3 <- day_index_pool[ which(day_index_pool[,3]==3),] chisq.test(as.factor(adaptors_pool3[,2]), as.factor(adaptors_pool3[,1]), simulate.p.value = TRUE)$p.value # p-value = 1 # Sequencing pool 4 adaptors_pool4 <- day_index_pool[ which(day_index_pool[,3]==4),] chisq.test(as.factor(adaptors_pool4[,2]), as.factor(adaptors_pool4[,1]), simulate.p.value = TRUE)$p.value # p-value = 1 fdr_val_4= p.adjust(c(1,1,1,1), method = "fdr") fdr_val_4 ``` # PART THREE: Which variables to consider putting in the model? ### RIN Score ```{r} # Make a table with species, day, and RIN score species <- After_removal_sample_info$Species day <- After_removal_sample_info$Day iPSC_prop <- as.data.frame(cbind(day, species, RNA_seq_info[,14]), stringsAsFactors = FALSE) # Remove the 1 sample with a missing RIN score remove_NA <- c(26) iPSC_prop <- iPSC_prop[-remove_NA, ] colnames(iPSC_prop) <- c("Day", "Species", "RIN_Score") iPSC_prop$Species[iPSC_prop$Species == "2"] <- "Human" iPSC_prop$Species[iPSC_prop$Species == "1"] <- "Chimp" iPSC_prop$Day[iPSC_prop$Day == "0"] <- "Day 0" iPSC_prop$Day[iPSC_prop$Day == "1"] <- "Day 1" iPSC_prop$Day[iPSC_prop$Day == "2"] <- "Day 2" iPSC_prop$Day[iPSC_prop$Day == "3"] <- "Day 3" ggplot(data = iPSC_prop, aes(y = RIN_Score, x = as.factor(Species))) + facet_wrap(~ Day, nrow = 1) + geom_boxplot() + geom_point(size = 5, position=position_jitter(width=0.2, height=0.1), show.legend = FALSE) + labs(x = "Species", y = "RIN score") + theme_bw() ggplot(data = iPSC_prop, aes(y = RIN_Score, x = as.factor(Species))) + facet_wrap(~ Day, nrow = 1) + geom_boxplot() + geom_dotplot(binaxis='y', stackdir='center', binwidth = 0.05) + labs(x = "Species", y = "RIN score") + theme_bw() ``` ### Passage at seed ```{r} # Make a table with species, day, and RIN score species <- After_removal_sample_info$Species day <- After_removal_sample_info$Day iPSC_prop <- as.data.frame(cbind(day, species, RNA_seq_info[1:63,35]), stringsAsFactors = FALSE) # Remove the 1 sample with a missing RIN score #remove_NA <- c(26) #iPSC_prop <- iPSC_prop[-remove_NA, ] colnames(iPSC_prop) <- c("Day", "Species", "RIN_Score") iPSC_prop$Species[iPSC_prop$Species == "2"] <- "Human" iPSC_prop$Species[iPSC_prop$Species == "1"] <- "Chimp" iPSC_prop$Day[iPSC_prop$Day == "0"] <- "Day 0" iPSC_prop$Day[iPSC_prop$Day == "1"] <- "Day 1" iPSC_prop$Day[iPSC_prop$Day == "2"] <- "Day 2" iPSC_prop$Day[iPSC_prop$Day == "3"] <- "Day 3" ggplot(data = iPSC_prop, aes(y = RIN_Score, x = as.factor(Species))) + facet_wrap(~ Day, nrow = 1) + geom_boxplot() + geom_point(size = 5, position=position_jitter(width=0.2, height=0.1), show.legend = FALSE) + labs(x = "Species", y = "Density at seeding") + theme_bw() ggplot(data = iPSC_prop, aes(y = RIN_Score, x = as.factor(Species))) + facet_wrap(~ Day, nrow = 1) + geom_boxplot() + geom_dotplot(binaxis='y', stackdir='center', binwidth = 0.001) + labs(x = "Species", y = "Density at seeding") + theme_bw() ``` ### Harvest time and harvest densities (supplement) ```{r} iPSC_prop <- as.data.frame(cbind(day, species, RNA_seq_info[,9], as.character(RNA_seq_info[,8])), stringsAsFactors = FALSE) colnames(iPSC_prop) <- c("Day", "Species", "Harvest_density", "Harvest_time") iPSC_prop$Species[iPSC_prop$Species == "2"] <- "Human" iPSC_prop$Species[iPSC_prop$Species == "1"] <- "Chimp" iPSC_prop$Day[iPSC_prop$Day == "0"] <- "Day 0" iPSC_prop$Day[iPSC_prop$Day == "1"] <- "Day 1" iPSC_prop$Day[iPSC_prop$Day == "2"] <- "Day 2" iPSC_prop$Day[iPSC_prop$Day == "3"] <- "Day 3" iPSC_prop$Harvest_density <- as.numeric(iPSC_prop$Harvest_density) ggplot(data = iPSC_prop, aes(y = Harvest_density, x = as.factor(Day))) + facet_wrap(~ Species, nrow = 1) + geom_boxplot() + geom_point(aes(color = as.factor(Species)), size = 5, position=position_jitter(width=0.2, height=0.1), show.legend = FALSE) ggplot(data = iPSC_prop, aes(y = Harvest_density, x = as.factor(Species))) + facet_wrap(~ Day, nrow = 1) + geom_boxplot() # Harvest density (supplement) ggplot(data = iPSC_prop, aes(y = Harvest_density, x = Species)) + facet_wrap(~ Day, nrow = 1) + geom_boxplot() + geom_dotplot(binaxis='y', stackdir='center') + labs(y = "Harvest density", title = "Harvest densities for each sample") + theme_bw() ggplot(data = iPSC_prop, aes(y = Harvest_density, x = Species)) + facet_wrap(~ Day, nrow = 1) + geom_boxplot() + geom_dotplot(binaxis='y', stackdir='center', dotsize = 0.75) + labs(y = "Harvest density", title = "Harvest densities for each sample") + theme_bw() # Harvest density for each day species median(iPSC_prop[which(iPSC_prop$Day == "Day 0" & iPSC_prop$Species == "human") , 3]) median(iPSC_prop[which(iPSC_prop$Day == "Day 0" & iPSC_prop$Species == "chimp") , 3]) median(iPSC_prop[which(iPSC_prop$Day == "Day 1" & iPSC_prop$Species == "human") , 3]) median(iPSC_prop[which(iPSC_prop$Day == "Day 1" & iPSC_prop$Species == "chimp") , 3]) median(iPSC_prop[which(iPSC_prop$Day == "Day 2" & iPSC_prop$Species == "human") , 3]) median(iPSC_prop[which(iPSC_prop$Day == "Day 2" & iPSC_prop$Species == "chimp") , 3]) median(iPSC_prop[which(iPSC_prop$Day == "Day 3" & iPSC_prop$Species == "human") , 3]) median(iPSC_prop[which(iPSC_prop$Day == "Day 3" & iPSC_prop$Species == "chimp") , 3]) # Harvest time species <- After_removal_sample_info$Species day <- After_removal_sample_info$Day iPSC_prop <- as.data.frame(cbind(day, species, RNA_seq_info[,9], as.character(RNA_seq_info[,8])), stringsAsFactors = FALSE) colnames(iPSC_prop) <- c("Day", "Species", "Harvest_density", "Harvest_time") iPSC_prop$Species[iPSC_prop$Species == "2"] <- "Human" iPSC_prop$Species[iPSC_prop$Species == "1"] <- "Chimp" iPSC_prop$Day[iPSC_prop$Day == "0"] <- "Day 0" iPSC_prop$Day[iPSC_prop$Day == "1"] <- "Day 1" iPSC_prop$Day[iPSC_prop$Day == "2"] <- "Day 2" iPSC_prop$Day[iPSC_prop$Day == "3"] <- "Day 3" table(iPSC_prop$Day, iPSC_prop$Harvest_time) ggplot(data = iPSC_prop, aes(y = as.factor(Harvest_time), x = Species)) + facet_wrap(~ Day, nrow = 1) + geom_point(aes(color = Species), size = 5, position=position_jitter(width=0.2, height=0.1), show.legend = FALSE) + theme_bw() iPSC_prop[which(iPSC_prop$Day == "Day 0" & iPSC_prop$Species == "chimp") , 4] d0c <- table(factor(iPSC_prop[which(iPSC_prop$Day == "Day 0" & iPSC_prop$Species == "chimp") , 4], levels = c("11:56", "12:10", "12:18", "12:30", "1:00 PM", "1:05 PM", "1:06 PM", "1:08 PM", "1:17 PM", "1:20 PM", "1:30 PM"))) iPSC_prop[which(iPSC_prop$Day == "Day 0" & iPSC_prop$Species == "human") , 4] d0h <- table(factor(iPSC_prop[which(iPSC_prop$Day == "Day 0" & iPSC_prop$Species == "human") , 4], levels = c("11:56", "12:10", "12:18", "12:30", "1:00 PM", "1:05 PM", "1:06 PM", "1:08 PM", "1:17 PM", "1:20 PM", "1:30 PM"))) iPSC_prop[which(iPSC_prop$Day == "Day 1" & iPSC_prop$Species == "chimp") , 4] d1c <- table(factor(iPSC_prop[which(iPSC_prop$Day == "Day 1" & iPSC_prop$Species == "chimp") , 4], levels = c("11:56", "12:10", "12:18", "12:30", "1:00 PM", "1:05 PM", "1:06 PM", "1:08 PM", "1:17 PM", "1:20 PM", "1:30 PM"))) iPSC_prop[which(iPSC_prop$Day == "Day 1" & iPSC_prop$Species == "human") , 4] d1h <- table(factor(iPSC_prop[which(iPSC_prop$Day == "Day 1" & iPSC_prop$Species == "human") , 4], levels = c("11:56", "12:10", "12:18", "12:30", "1:00 PM", "1:05 PM", "1:06 PM", "1:08 PM", "1:17 PM", "1:20 PM", "1:30 PM"))) iPSC_prop[which(iPSC_prop$Day == "Day 2" & iPSC_prop$Species == "chimp") , 4] d2c <- table(factor(iPSC_prop[which(iPSC_prop$Day == "Day 2" & iPSC_prop$Species == "chimp") , 4], levels = c("11:56", "12:10", "12:18", "12:30", "1:00 PM", "1:05 PM", "1:06 PM", "1:08 PM", "1:17 PM", "1:20 PM", "1:30 PM"))) iPSC_prop[which(iPSC_prop$Day == "Day 2" & iPSC_prop$Species == "human") , 4] d2h <- table(factor(iPSC_prop[which(iPSC_prop$Day == "Day 2" & iPSC_prop$Species == "human") , 4], levels = c("11:56", "12:10", "12:18", "12:30", "1:00 PM", "1:05 PM", "1:06 PM", "1:08 PM", "1:17 PM", "1:20 PM", "1:30 PM"))) iPSC_prop[which(iPSC_prop$Day == "Day 3" & iPSC_prop$Species == "chimp") , 4] d3c <- table(factor(iPSC_prop[which(iPSC_prop$Day == "Day 3" & iPSC_prop$Species == "chimp") , 4], levels = c("11:56", "12:10", "12:18", "12:30", "1:00 PM", "1:05 PM", "1:06 PM", "1:08 PM", "1:17 PM", "1:20 PM", "1:30 PM"))) iPSC_prop[which(iPSC_prop$Day == "Day 3" & iPSC_prop$Species == "human") , 4] d3h <- table(factor(iPSC_prop[which(iPSC_prop$Day == "Day 3" & iPSC_prop$Species == "human") , 4], levels = c("11:56", "12:10", "12:18", "12:30", "1:00 PM", "1:05 PM", "1:06 PM", "1:08 PM", "1:17 PM", "1:20 PM", "1:30 PM"))) #Check that the frequencies match the input (n = 59) frequency_time <- as.data.frame(cbind(d0c, d0h, d1c, d1h, d2c, d2h, d3c, d3h)) frequency_time plot(frequency_time[,1]) rowSums(frequency_time) # Order correctly iPSC_prop$Harvest_time[iPSC_prop$Harvest_time == "11:56"] <- "11:56 AM" iPSC_prop$Harvest_time[iPSC_prop$Harvest_time == "12:10"] <- "12:10 PM" iPSC_prop$Harvest_time[iPSC_prop$Harvest_time == "12:18"] <- "12:18 PM" iPSC_prop$Harvest_time[iPSC_prop$Harvest_time == "12:30"] <- "12:30 PM" lv <- c("1:30 PM", "1:20 PM", "1:17 PM", "1:08 PM", "1:06 PM", "1:05 PM", "1:00 PM", "12:30 PM", "12:18 PM", "12:10 PM", "11:56 AM") x <- factor(iPSC_prop$Harvest_time,levels = lv) # Make plot ggplot(data = iPSC_prop, aes(y = as.factor(x), x = as.factor(Species))) + facet_wrap(~ Day, nrow = 1) + geom_dotplot(binaxis='y', stackdir='center', binwidth = 0.2) + labs(x = "Species", y = "Harvest time") + theme_bw() ``` ### Purity ```{r} # Make a table with species, day, and RIN score species <- After_removal_sample_info$Species day <- After_removal_sample_info$Day iPSC_prop <- as.data.frame(cbind(day, species, RNA_seq_info[,10]), stringsAsFactors = FALSE) # Remove the samples with missing purity iPSC_prop_purity <- iPSC_prop[complete.cases(iPSC_prop) == TRUE, ] colnames(iPSC_prop_purity) <- c("Day", "Species", "Purity") iPSC_prop_purity$Species[iPSC_prop_purity$Species == "2"] <- "Human" iPSC_prop_purity$Species[iPSC_prop_purity$Species == "1"] <- "Chimp" iPSC_prop_purity$Day iPSC_prop_purity$Day[iPSC_prop_purity$Day == "0"] <- "Day 0" iPSC_prop_purity$Day[iPSC_prop_purity$Day == "1"] <- "Day 1" iPSC_prop_purity$Day[iPSC_prop_purity$Day == "2"] <- "Day 2" iPSC_prop_purity$Day[iPSC_prop_purity$Day == "3"] <- "Day 3" ggplot(data = iPSC_prop_purity, aes(y = Purity, x = as.factor(Species))) + facet_wrap(~ Day, nrow = 1) + geom_boxplot() + geom_point(size = 5, position=position_jitter(width=0.2, height=0.1), show.legend = FALSE) + labs(x = "Species", y = "Purity") + theme_bw() ggplot(data = iPSC_prop_purity, aes(y = Purity, x = as.factor(Species))) + facet_wrap(~ Day, nrow = 1) + geom_boxplot() + geom_dotplot(binaxis='y', stackdir='center', binwidth = .02) + labs(x = "Species", y = "Purity") + theme_bw() ``` # PART FOUR: See if any of the variables for RNA-Seq correlate with the expression PCs for genes (40 samples) ## Initialization ```{r} # Load cpm data cpm_in_cutoff_40 <- read.delim("../data/cpm_cyclicloess_40.txt") # Load sample information bio_rep_samplefactors <- read.delim("../data/samplefactors-filtered.txt", stringsAsFactors=FALSE) day <- bio_rep_samplefactors$Day species <- bio_rep_samplefactors$Species ``` ## Obtain gene expression PCs (40 samples) ```{r} # PCs pca_genes <- prcomp(t(cpm_in_cutoff_40), scale = T, retx = TRUE, center = TRUE) matrixpca <- pca_genes$x pc1 <- matrixpca[,1] pc2 <- matrixpca[,2] pc3 <- matrixpca[,3] pc4 <- matrixpca[,4] pc5 <- matrixpca[,5] pcs <- data.frame(pc1, pc2, pc3, pc4, pc5) summary <- summary(pca_genes) ``` ## Testing association between a particular variable and PCs with a linear model (40 samples) ```{r} # TESTING BIOLOGICAL VARIABLES OF INTEREST PC_pvalues_day = matrix(data = NA, nrow = 5, ncol = 1, dimnames = list(c("PC1", "PC2", "PC3", "PC4", "PC5"), c("Day"))) for(i in 1:5){ # PC versus day checkPC1 <- lm(pcs[,i] ~ as.factor(day)) #Get the summary statistics from it summary(checkPC1) #Get the p-value of the F-statistic summary(checkPC1)$fstatistic fstat <- as.data.frame(summary(checkPC1)$fstatistic) p_fstat <- 1-pf(fstat[1,], fstat[2,], fstat[3,]) PC_pvalues_day[i,1] <- p_fstat #Fraction of the variance explained by the model r2_value <- summary(checkPC1)$r.squared } # PC versus species PC_pvalues_species = matrix(data = NA, nrow = 5, ncol = 1, dimnames = list(c("PC1", "PC2", "PC3", "PC4", "PC5"), c("Species"))) for(i in 1:5){ # PC versus species checkPC1 <- lm(pcs[,i] ~ as.factor(species)) #Get the summary statistics from it summary(checkPC1) #Get the p-value of the F-statistic summary(checkPC1)$fstatistic fstat <- as.data.frame(summary(checkPC1)$fstatistic) p_fstat <- 1-pf(fstat[1,], fstat[2,], fstat[3,]) PC_pvalues_species [i,1] <- p_fstat #Fraction of the variance explained by the model r2_value <- summary(checkPC1)$r.squared } # Combine tables collapse_table_full <- rbind(PC_pvalues_day, PC_pvalues_species) #Calculate q-values (FDR = 10%) fdr_val = p.adjust(collapse_table_full, method = "fdr", n = length(collapse_table_full)*2) collapse_table_fdr_val = matrix(data = fdr_val, nrow = 5, ncol = 2, dimnames = list(c("PC1", "PC2", "PC3", "PC4", "PC5"), c("Day", "Species"))) collapse_table_fdr_val ```