#!/usr/bin/env Rscript library("dplyr") # read the gene clusters summary table which is the output of # anvi-summarize command run prior: gene_clusters_df <- read.table(file = 'Wolbachia-Pan/SUMMARY/Wolbachia-Pan_gene_clusters_summary.txt', header = TRUE, sep = "\t", quote = "") # remove singleton gene clusters unique to wPip since they have no coverage values gene_clusters_df_of_wPip <- gene_clusters_df[gene_clusters_df$genome_name == "wPip", ] # read the blast output for best WO hits among wPip genes WO_in_wPip_df <- read.table(file = 'WO_in_wPip_best_hit.txt', header = TRUE, sep = "\t", quote = "") # learn which gene cluster ids corrpespond to wPip gene calls gene_clusters_df_of_wPip_phages=left_join(gene_clusters_df_of_wPip, WO_in_wPip_df, by="gene_callers_id") # keep only columns of interest gene_clusters_df_of_wPip_phages_short <- gene_clusters_df_of_wPip_phages[, colnames(gene_clusters_df_of_wPip_phages) %in% c('WO_assignment', 'pct_alignment', 'gene_cluster_id')] # now we need to generate coverage values for each gene cluster. # O07 had the most number of gene clusters, presenting a meaningful # target to have coverage values for as many gene clusters as possible. # here, we first get the gene clusters for O07 gene_clusters_df_of_O07 <- gene_clusters_df[gene_clusters_df$genome_name == "Wolbachia_O07", ] # then remove all columns but two gene_clusters_df_of_O07 <- gene_clusters_df_of_O07[, colnames(gene_clusters_df_of_O07) %in% c('gene_cluster_id', 'gene_callers_id')] # learn gene coverages for all four Wolbachia genomes in the O07 metagenome using # the summary of the merged profile database for O07: gene_coverages_df_of_O07 <- read.table(file='06_MERGED/Culex_O07/SUMMARY/bin_by_bin/Wolbachia/Wolbachia-gene_coverages.txt', header=TRUE, sep="\t", quote="") # anvi'o adds ribosomal RNA genes ad hoc after HMM runs, so they appear among gene calls but not in the # pangenome which only uses ORFs identified through prodigal. we will remove them bofore continuing: missing_genes <- setdiff(gene_coverages_df_of_O07$gene_callers_id, gene_clusters_df_of_O07$gene_callers_id) gene_coverages_df_of_O07 <- gene_coverages_df_of_O07[!(gene_coverages_df_of_O07$gene_callers_id %in% missing_genes), ] # now we can merge the two; gene_clusters_df_of_O07_with_covs=full_join(gene_clusters_df_of_O07, gene_coverages_df_of_O07, by="gene_callers_id") # the following takes the mean coverage of genes per gene cluster Mean_coverage_gene_clusters_df_of_O07 <- gene_clusters_df_of_O07_with_covs %>% group_by(gene_cluster_id) %>% summarize(mean(Wolbachia_O03), mean(Wolbachia_O07), mean(Wolbachia_O11), mean(Wolbachia_O12)) # Join WO assignment and Coverage so that we get GC common to wPip and # O07 with 4 additional layers gene_clusters_df <- full_join(gene_clusters_df_of_wPip_phages_short, Mean_coverage_gene_clusters_df_of_O07, by="gene_cluster_id") # give better names for each column names(gene_clusters_df) <- c('gene_cluster_id', 'WO_assignment', 'pct_alignment', 'cov_in_003', 'cov_in_007', 'cov_in_011', 'cov_in_012') # Here we have a data frame that lists for each gene cluster (1) the mean coverage values # per gene per genome, (2) WO assignments if genes match to wPip prophages, and (3) the percent # alignment of the query gene seqeunce so we can filter out weak hits. the following # section of the code works on this data frame to produce a final additional item # data file to be incorporated into the pan genome. # a better max function that the default max function. just ignore: better_max <- function(x) ifelse( !all(is.na(x)), max(x, na.rm=T), NA) output_df <- data.frame(gene_cluster_id=character(), WO_assignmnet=character(), pct_alignment=numeric(), cov_in_003=numeric(), cov_in_007=numeric(), cov_in_011=numeric(), cov_in_012=numeric(), stringsAsFactors=FALSE) # here we go through every gene cluster, pick the most frequent WO assignment if there are any, # and then discard the ones resulted from hits where less than 90% of the query sequence # were aligned N = 1 for (gene_cluster in levels(gene_clusters_df$gene_cluster_id)){ # gene_cluster <- "GC_00001082" df <- gene_clusters_df[gene_clusters_df$gene_cluster_id == gene_cluster, ] freqs <- as.data.frame(table(df$WO_assignment)) freqs <- freqs[freqs$Freq > 0, ] most_frequent_assignment <- as.character(freqs[order(c(-freqs$Freq)), ]$Var1[1]) best_pct_alignment_score <- better_max(df$pct_alignment) # remove weak WO assignments if (!is.na(df$pct_alignment[1])){ if (df$pct_alignment[1] < 90){ most_frequent_assignment <- NA; } } output_df[N, ] <- c(gene_cluster, most_frequent_assignment, df$pct_alignment[1], df$cov_in_003[1], df$cov_in_007[1], df$cov_in_011[1], df$cov_in_012[1]) N = N + 1 } # store the output data frame as a TAB delimited file so we can import it back # into the anvi'o additional item data table write.table(output_df, "gene_clusters_additional_data.txt", quote=FALSE, sep="\t", na="", row.names=FALSE)