##### # Test of Essentiality # This script runs the test of essentiality for genes of hpara in absolute condition # Return value is table identifying essential genes using a Monte Carlo simulation-based approach for mariner transposons that insert at TA-sites # This script was inspired by the codes from the Whitley Lab (https://github.com/WhiteleyLab/Tn-seq/blob/master/TnSeqDESeq2Essential_mariner.R) # Date: Oct 12, 2021 # Author: Thais Harder de Palma ##### ##### # Preamble # Set WD with location of data setwd("/data/mramseylab/TnSeq_THP/test_duplicates/data") # load packages library(tidyverse) library(DESeq2) ### Set up output file (txt) # Parameters filenames = dir(pattern = "monoculture_aero-sites.txt") # Find file names conditions = sub("-sites.txt", "", filenames) # Extract file names core gff_pfx = "hpara" # Genome #set the name of your condition test_pfx = gff_pfx #set the number of replicates you have test_reps = 2 #set the number of pseudodatasets to generate num_expected = 1000 #set the output file prefix out_pfx = paste(test_pfx, "Essential_1000", sep = "_") #set the number of sites with the most reads to "trim" (remove) from the data to_trim = 50 #can put the "conditions" here, see variable above, the order does not matter in_files = conditions # set path for output folder path = '/data/mramseylab/TnSeq_THP/test_duplicates/output/mono_aero_duplicates/' # Initialize output file write(out_pfx, file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep="")) # this creates a txt file, with the info from the next lines of code that start with "write" write(paste( "\n", "input files:", sep=""), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) write(in_files, file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) write(paste( "\n", "gff: ", gff_pfx, sep=""), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) write(paste( "\n", "number of replicates: ", test_reps, sep=""), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) write(paste("\n", "to trim: ", to_trim, sep=""), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) write(paste("number of expected datasets: ", num_expected, sep=""), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) ### Load sites files sites = data.frame(Pos=c(0)) for (i in 1:length(in_files)) { newsites = read.table(paste(paste(in_files[i], sep="/"), "sites.txt", sep="-")) colnames(newsites) = c(paste("V", i, sep=""), "Pos") newsites = tail(newsites, n=-to_trim) newsites = arrange(newsites,Pos) sites = merge(sites, newsites, all=T) } sites = tail(sites, n=-1) # Delete position 0 sites[is.na(sites)] = 0 # Fill NA with 0 ##### # Prepare data # LOESS smooth data - corrects for how multifork replication can inflate the abundance of insertions close to the origin of replication (see Narayanan et al. 2017, p.12). for (i in 2:(length(sites))) { counts.loess <- loess(sites[[i]] ~ sites$Pos, span=1, data.frame(x=sites$Pos, y=sites[[i]]), control=loess.control(statistics=c("approximate"),trace.hat=c("approximate"))) counts.predict <- predict(counts.loess, data.frame(x=sites$Pos)) counts.ratio <- counts.predict/median(counts.predict) sites[[i]] <- sites[[i]]/counts.ratio } # output number of total and shared sites across samples. Need to edit based on number of samples. sites_total = sites %>% mutate(numreps = rowSums(select(., contains("V")))) %>% filter(numreps >= 1) write(paste("number of sites:", nrow(sites_total)), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) sites_shared = sites %>% mutate(numreps = rowSums(select(., contains("V")))) %>% filter(numreps >= 2) write(paste("number of sites shared by at least 2 samples:", nrow(sites_shared)), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) sites_all = sites %>% mutate(numreps = rowSums(select(., contains("V")))) %>% filter(numreps >= 3) write(paste("number of sites in all three samples:", nrow(sites_all)), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) # Normalize data by reads/site colData = data.frame(c(rep(test_pfx, test_reps)), condition = rep("untreated", test_reps)) sitescds = sites[,2:length(sites)] %>% round %>% DESeqDataSetFromMatrix(colData = colData, design= ~ 1) sitescds = estimateSizeFactors(sitescds) #Output the normalized counts counts.norm = counts(sitescds, normalized=F) rownames(counts.norm) = sites$Pos # Initialize the list of genes, determine genome length #can add column names if your gff has extra columns gff = read.delim(file= '/data/mramseylab/TnSeq_THP/test_duplicates/data/hpara.gff', sep="\t", fill=TRUE, header=F, col.names = c("seqname", "source", "feature", "start", "end", "score", "strand", "frame", "att")) print(head(gff)) gff = tail(gff, n=-1) # after n, put - the number of rows with header gff = gff[(gff$feature=="CDS"),] # Keep only observations of CDS gff = gff %>% mutate(range = end - start, end = round(end - range*0.1, 0)) %>% select(-range) # Removes the 50 most abundant insertion sites (correct for amplification bias) print(head(gff)) # Initialize read counts per gene and number of independent Tn sites per gene # Generate pseudo-datasets with the same number of insertion sites and total reads mapping to those sites, randomly distributed across the genome at TA sites print("Generating pseudo-datasets") counts.df = data.frame(counts.norm) counts.df$Pos = as.numeric(rownames(counts.df)) counts.df = arrange(counts.df, Pos) numreads = sum(counts.norm)/test_reps numsites = length(which(counts.norm>0))/test_reps #generates a new sites data frame and calculates the mean counts for each site in a new column. This column (containing the mean) is then used to sample from to generate the expected datasets. sites2 = sites[2:(test_reps+1)] sites2$mean =rowMeans(sites2) %>% round Possible_sites = read.csv("/data/mramseylab/TnSeq_THP/test_duplicates/data/hpara_TAsites_absolute.csv") #possible TA insertion sites for mariner transposon #compute expected dataset for (i in 1:num_expected) { print(i) expected = data.frame(Pos=sample(Possible_sites$sites, round(numsites,0), replace = T), Exp=sample(sites2$mean, round(numsites,0), replace = F)) %>% arrange(Pos) %>% group_by(Pos) %>% mutate(obs = row_number()) %>% filter(obs < 2) %>% ungroup() %>% select(-obs) #may need to change column name depending on sites file colnames(expected)[2] = paste("Expected", i, sep=".") counts.df = left_join(counts.df, expected) %>% arrange(Pos) counts.df[is.na(counts.df)] = 0 } rownames(counts.df) = counts.df$Pos # check counts.norm = as.matrix(counts.df[,(1:(length(counts.df)-1))]) write(paste("\n", "numreads: ", numreads, sep=""), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) write(paste("numsites: ", numsites, sep=""), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) # Initialize the lists of read counts per gene and number of independent Tn sites per gene controlreps = 0 expreps = 0 for (c in 1:length(counts.norm[1,])) { gff[,c+9] = rep(1,length(gff[,1])) if (controlreps < test_reps) { controlreps = controlreps + 1 colnames(gff)[c+9] = paste(test_pfx, controlreps, sep=".") } else { expreps = expreps + 1 colnames(gff)[c+9] = paste("Expected", expreps, sep=".") } } # Output gene boundaries and read counts per Tn site for Perl binning script print("Binning read counts by gene boundaries") boundariesfile = paste(path, paste(out_pfx, ".boundaries.tsv", sep=""), sep="") sitecountsfile = paste(path, paste(out_pfx, ".sitecounts.tsv", sep=""), sep="") write.table(gff[,c(4,5, 10:length(gff))], boundariesfile, quote=FALSE, sep="\t", row.names=F) write.table(counts.df, sitecountsfile, quote=FALSE, sep="\t", row.names=F) # Bin sites into genes key = gff[,c(4,5)] numsites = NA genecounts = NA for (i in 1:nrow(key)) { tocount = counts.df %>% filter(Pos >= key$start[i] & Pos <= key$end[i]) counts = tocount %>% summarise(across(-Pos, sum)) %>% mutate(across(everything(), ~ifelse(.x == 0, 1, .x))) site = tocount %>% mutate(across(-Pos, ~ifelse(.x > 0, 1, 0))) %>% summarise(across(-Pos, sum)) genecounts = rbind(genecounts, counts) numsites = rbind(numsites, site) } genecounts = genecounts[-1,] numsites = numsites[-1,] genecounts2 = cbind(key, genecounts) genes = data.frame(id = rep("", length(gff[,1]), stringsAsFactors = FALSE)) genes$id = as.character(genes$id) for (i in 1:length(gff[,1])) { genes$id[i] = strsplit(grep("ID",strsplit(as.character(gff$att[i]),";")[[1]], value=T),"=")[[1]][2] genes$name[i] = strsplit(grep("Name",strsplit(as.character(gff$att[i]),";")[[1]], value=T),"=")[[1]][2] } #write.table(genecounts2, paste(path, paste(out_pfx, ".genecounts2.tsv", sep=""), sep=""), quote=FALSE, sep="\t", row.names=FALSE) # COmmented for now because too large # Perform differential fitness analysis colnames(numsites) = colnames(gff)[10:length(gff)] #change this number based on number of columns in gff numsitesout = data.frame(numsites[,(1:test_reps)]) numsitesout[,test_reps+1] = rowMeans(numsites[,-(1:test_reps)]) colnames(numsitesout)[test_reps+1] = "Expected_numsites" colData = data.frame(c(rep(test_pfx, test_reps), rep("Expected", num_expected)), condition = c(rep(test_pfx, test_reps),rep("Expected", num_expected))) numcountsout = data.frame(genecounts2[,1:(test_reps+2)]) numcountsout[,test_reps+3] = rowMeans(genecounts2[,-(1:(test_reps+2))]) colnames(numcountsout)[test_reps+3] = "Expected_numreads" ### betaPrior=FALSE #When betaPrior=FALSE in DESeq2 step "nbinomWaldTest," log2fold changes are NOT shrunk towards zero, even when counts are low, disperson is high, or degrees of freedom is low. #We consider betaPrior=FALSE as the less conservative analysis. It is more likely to identify genes with a large l2fc simply because the data is noisy genescds_F = DESeqDataSetFromMatrix(countData = round(genecounts), colData = colData[1:(num_expected+test_reps),], design = ~ condition) genescds_F = estimateSizeFactors(genescds_F) genescds_F = estimateDispersions(genescds_F) #can set fitType="local" if getting flag at this step for some samples genescds_F = nbinomWaldTest(genescds_F, betaPrior=FALSE) res_F = results(genescds_F, cooksCutoff = FALSE, contrast = c("condition", test_pfx, "Expected")) print(head(res_F)) out_F = cbind(res_F, genes$id, genes$name, numcountsout, numsitesout) colnames(out_F)[2] = "log2FoldChange_FALSE" colnames(out_F)[3] = "lfcSE_FALSE" colnames(out_F)[4] = "stat_FALSE" colnames(out_F)[5] = "pvalue_FALSE" colnames(out_F)[6] = "padj_FALSE" colnames(out_F)[7] = "id" colnames(out_F)[8] = 'name' #Count number of significant genes; can adjust pvalue and padj here based on output(s) that you are interested in write(paste("\n", "Analyses with betaPrior = FALSE (no l2fc shrinkage):", sep=""), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) write(paste(paste("\n", "number of genes with pvalue <= 0.01 in FALSE:", sep=""), sum(res_F$pvalue<=.01, na.rm=TRUE)), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) write(paste("number of genes with padj <= 0.01 in FALSE:", sum(res_F$padj<=.01, na.rm=TRUE)), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) # Perform bimodal clustering and essentiality calling and output results library(mclust) fit_F = Mclust(out_F$log2FoldChange_FALSE, G=1:2, modelNames = "V") #Assume variable variance of clusters summary(fit_F, parameters = TRUE) category_F = rep("",length(out_F$id)) for (i in 1:length(out_F$id)) { if (fit_F$classification[i] == 1 & out_F$log2FoldChange_FALSE[i] < 0) { category_F[i] = "Reduced" } else { category_F[i] = "Unchanged" } } density_F = densityMclust(out_F$log2FoldChange_FALSE, G=1:2, modelNames = "V") #Assume variable variance of clusters summary(density_F) pdf(paste(path, paste(out_pfx, "FALSE_histogram.pdf", sep="_"), sep=""), width = 10, height = 4) par(mfrow = c(1,2)) plot(density_F, what = "density", data = out_F$log2FoldChange_FALSE, breaks=100) plot(fit_F, what = "uncertainty") dev.off() #Count number of unchanged and reduced genes, l2fc means, and l2fc variances write(paste(paste("\n", "Mclust Unchanged FALSE:", sep=""), sum(category_F=="Unchanged")), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) write(paste("Mclust Reduced FALSE:", sum(category_F=="Reduced")), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) write(paste("\n", "Mclust_reduced_mean_False: ", fit_F$parameters$mean[1], sep =""), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) write(paste("Mclust_unchanged_mean_False: ", fit_F$parameters$mean[2], sep =""), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) write(paste(paste("\n", "Mclust_reduced_variance_False: ", sep=""), fit_F$parameters$variance$sigmasq[1], sep =""), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) write(paste("Mclust_unchanged_variance_False: ", fit_F$parameters$variance$sigmasq[2], sep =""), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) fit_F$uncertainty[which(out_F$log2FoldChange_FALSE > 0)] = 0 print(head(category_F, 10)) essentiality_F = as.data.frame(cbind(category_F, fit_F$uncertainty)) colnames(essentiality_F) = c("Essentiality_FALSE", "Uncertainty_FALSE") out_F = cbind(out_F, essentiality_F) #Count number of significant genes with reduced Mclust; can adjust padj (or pvalue) and uncertaintly values here based on output(s) that you are interested in write(paste("\n", "Reduced and Uncertainty <=0.01 if FALSE:", sum(out_F$Essentiality_FALSE=="Reduced" & as.numeric(as.character(out_F$Uncertainty_FALSE))<=0.01)), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) write(paste("Reduced and Uncertainty <=0.01 and padj <=0.01 if FALSE:", sum(out_F$Essentiality_FALSE=="Reduced" & out_F$padj_FALSE<=.01 & as.numeric(as.character(out_F$Uncertainty_FALSE))<=0.01, na.rm=TRUE)), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) write(paste("\n", "Reduced and Uncertainty <=0.05 if FALSE:", sum(out_F$Essentiality_FALSE=="Reduced" & as.numeric(as.character(out_F$Uncertainty_FALSE))<=0.05)), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) write(paste("Reduced and Uncertainty <=0.05 and padj <=0.05 if FALSE:", sum(out_F$Essentiality_FALSE=="Reduced" & out_F$padj_FALSE<=.05 & as.numeric(as.character(out_F$Uncertainty_FALSE))<=0.05, na.rm=TRUE)), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) write.csv(out_F, file=paste(path, paste(out_pfx, ".FALSE.DESeq.csv", sep=""), sep=""), row.names=T) ############################ #betaPrior=TRUE #When betaPrior=TRUE in DESeq2 step "nbinomWaldTest," log2fold changes ARE shrunk towards zero when counts are low, disperson is high, or degrees of freedom is low. #We consider betaPrior=TRUE as the more conservative analysis. It is less likely to identify genes with a large l2fc simply because the data is noisy #betaPrior=TRUE is also the legacy DESeq2 option prior to version 1.16 (November 2016) genescds_T = DESeqDataSetFromMatrix(countData = round(genecounts), colData = colData[1:(num_expected + test_reps),], design = ~ condition) genescds_T = estimateSizeFactors(genescds_T) genescds_T = estimateDispersions(genescds_T, fitType="local") #can set fitType="local" if getting flag at this step for some samples genescds_T = nbinomWaldTest(genescds_T, betaPrior=TRUE) res_T = results(genescds_T, cooksCutoff = FALSE, contrast = c("condition", test_pfx, "Expected")) print(head(res_T)) out_T = cbind(res_T, genes$id, genes$name, numcountsout, numsitesout) colnames(out_T)[2] = "log2FoldChange_TRUE" colnames(out_T)[3] = "lfcSE_TRUE" colnames(out_T)[4] = "stat_TRUE" colnames(out_T)[5] = "pvalue_TRUE" colnames(out_T)[6] = "padj_TRUE" colnames(out_T)[7] = "id" colnames(out_T)[8] = 'name' #Count number of significant genes; can adjust pvalue and padj here based on output(s) that you are interested in write(paste("\n", "Analyses with betaPrior = TRUE (l2fc shrinkage):", sep=""), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) write(paste(paste("\n", "number of genes with pvalue <= 0.01 in TRUE:", sep=""), sum(res_T$pvalue<=.01, na.rm=TRUE)), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) write(paste("number of genes with padj <= 0.01 in TRUE:", sum(res_T$padj<=.01, na.rm=TRUE)), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) # Perform bimodal clustering and essentiality calling and output results library(mclust) fit_T = Mclust(out_T$log2FoldChange_TRUE, G=1:2, modelNames = "V") #Assume variable variance of clusters summary(fit_T, parameters = TRUE) category_T = rep("",length(out_T$id)) for (i in 1:length(out_T$id)) { if (fit_T$classification[i] == 1 & out_T$log2FoldChange_TRUE[i] < 0) { category_T[i] = "Reduced" } else { category_T[i] = "Unchanged" } } density_T = densityMclust(out_T$log2FoldChange_TRUE, G=1:2, modelNames = "V") #Assume variable variance of clusters summary(density_T) pdf(paste(path, paste(out_pfx, "TRUE_histogram.pdf", sep="_"), sep=""), width = 10, height = 4) par(mfrow = c(1,2)) plot(density_T, what = "density", data = out_T$log2FoldChange_TRUE, breaks=100) plot(fit_T, what = "uncertainty") dev.off() #Count number of unchanged and reduced genes, l2fc means, and l2fc variances write(paste(paste("\n", "Mclust Unchanged TRUE:", sep=""), sum(category_T=="Unchanged")), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) write(paste("Mclust Reduced TRUE:", sum(category_T=="Reduced")), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) write(paste(paste("\n","Mclust_reduced_mean_TRUE: ", sep=""), fit_T$parameters$mean[1], sep =""), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) write(paste("Mclust_unchanged_mean_TRUE: ", fit_T$parameters$mean[2], sep =""), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) write(paste(paste("\n", "Mclust_reduced_variance_TRUE: ", sep=""), fit_T$parameters$variance$sigmasq[1], sep =""), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) write(paste("Mclust_unchanged_variance_TRUE: ", fit_T$parameters$variance$sigmasq[2], sep =""), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) fit_T$uncertainty[which(out_T$log2FoldChange_TRUE > 0)] = 0 print(head(category_T, 10)) essentiality_T = as.data.frame(cbind(category_T, fit_T$uncertainty)) colnames(essentiality_T) = c("Essentiality_TRUE", "Uncertainty_TRUE") out_T = cbind(out_T, essentiality_T) #Count number of significant genes with reduced Mclust; can adjust padj (or pvalue) and uncertaintly values here based on output(s) that you are interested in write(paste("\n", "Reduced and Uncertainty <=0.01 if TRUE:", sum(out_T$Essentiality_TRUE=="Reduced" & as.numeric(as.character(out_T$Uncertainty_TRUE))<=0.01)), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) write(paste("Reduced and Uncertainty <=0.01 and padj <=0.01 if TRUE:", sum(out_T$Essentiality_TRUE=="Reduced" & out_T$padj_TRUE<=.01 & as.numeric(as.character(out_T$Uncertainty_TRUE))<=0.01, na.rm=TRUE)), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) write(paste("\n", "Reduced and Uncertainty <=0.05 if TRUE:", sum(out_T$Essentiality_TRUE=="Reduced" & as.numeric(as.character(out_T$Uncertainty_TRUE))<=0.05)), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) write(paste("Reduced and Uncertainty <=0.05 and padj <=0.05 if TRUE:", sum(out_T$Essentiality_TRUE=="Reduced" & out_T$padj_TRUE<=.05 & as.numeric(as.character(out_T$Uncertainty_TRUE))<=0.05, na.rm=TRUE)), file = paste(path, paste(out_pfx, "_stats.txt", sep=""), sep=""), append = TRUE) write.csv(out_T, file=paste(path, paste(out_pfx, ".TRUE.DESeq.csv", sep=""), sep=""), row.names=T) #Combine betaPrior=TRUE and betaPrior=FALSE data out_combo = cbind(out_F[,1:6], out_F[,(test_reps+test_reps+12):(test_reps+test_reps+14)], out_T[,1:6], out_T[,(test_reps+test_reps+12):(test_reps+test_reps+14)], out_T[,7:(test_reps+test_reps+11)]) write.csv(out_combo, file=paste(path, paste(out_pfx, ".combo.DESeq_duplicates.csv", sep=""), sep=""), row.names=T)