#!/usr/bin/env Rscript suppressMessages(library("dplyr")) # read in the genome lenghts per Wolbachia genome genome_lengths <- read.table(file = 'genome-lengths.txt', header = TRUE, sep = "\t", quote = "") # read in the concatenated variability profiles for each # of the four metagenomes generated by the anvi'o program # anvi-gen-variability-profile variability <- read.table(file = 'variability-all.txt', header = TRUE, sep = "\t", quote = "") # keep only single-nucleotide variants that fall into the # context of complete gene calls to avoid dubious contig # ends where variants often emerge due to mappig artifacts df <- variability[variability$in_complete_gene_call == "1" & variability$departure_from_consensus > 0.1, ] # count how many SNVs you have per metagenome num_snvs <- as.data.frame(table(df$sample_id)) names(num_snvs) <- c('metagenome', 'num_snvs') # merge it with genome lengths snvs=full_join(num_snvs, genome_lengths, by="metagenome") # calculate the percent SNVs per metagenome snvs$percent_snvs <- snvs$num_snvs / snvs$genome_length * 100 # add position info snvs$in_first <- 0 snvs$in_second <- 0 snvs$in_third <- 0 for(mg in snvs$metagenome){ arr <- df[df$sample_id == mg, ]$base_pos_in_codon snvs[snvs$metagenome == mg, ]$in_first <- length(arr[arr == 1]) * 100 / length(arr) snvs[snvs$metagenome == mg, ]$in_second <- length(arr[arr == 2]) * 100 / length(arr) snvs[snvs$metagenome == mg, ]$in_third <- length(arr[arr == 3]) * 100 / length(arr) } # print it nicely cat(sprintf(" ======================================================================= Percent SNV density for each Wolbachia genome in metagenomes from which they were recovered, the average coverage of nucleotide positions at which SNVs were observed, and the distribution of SNVs across codon positions =======================================================================\n")) for(mg in snvs$metagenome){ cat(sprintf("Wolbachia in %s: %.2f%% (%d SNVs with an avg. cov. of %dX; %.1f%% in 1st, %.1f%% in 2nd, %.1f%% in 3rd nt pos).\n\n", mg, snvs[snvs$metagenome == mg, ]$percent_snvs, snvs[snvs$metagenome == mg, ]$num_snvs, round(mean(df[df$sample_id == mg, ]$coverage)), snvs[snvs$metagenome == mg, ]$in_first, snvs[snvs$metagenome == mg, ]$in_second, snvs[snvs$metagenome == mg, ]$in_third)) }