#!/usr/bin/env Rscript

suppressMessages(library("dplyr"))

# read in the genome lenghts per Wolbachia genome
genome_lengths <- read.table(file = 'genome-lengths.txt',
                             header = TRUE,
                             sep = "\t",
                             quote = "")

# read in the concatenated variability profiles for each
# of the four metagenomes generated by the anvi'o program
# anvi-gen-variability-profile
variability <- read.table(file = 'variability-all.txt',
                          header = TRUE,
                          sep = "\t",
                          quote = "")

# keep only single-nucleotide variants that fall into the
# context of complete gene calls to avoid dubious contig
# ends where variants often emerge due to mappig artifacts
df <- variability[variability$in_complete_gene_call == "1" & 
                  variability$departure_from_consensus > 0.1, ]

# count how many SNVs you have per metagenome
num_snvs <- as.data.frame(table(df$sample_id))
names(num_snvs) <- c('metagenome', 'num_snvs')

# merge it with genome lengths
snvs=full_join(num_snvs, genome_lengths, by="metagenome")

# calculate the percent SNVs per metagenome
snvs$percent_snvs <- snvs$num_snvs / snvs$genome_length * 100

# add position info
snvs$in_first <- 0
snvs$in_second <- 0
snvs$in_third <- 0

for(mg in snvs$metagenome){
  arr <- df[df$sample_id == mg, ]$base_pos_in_codon
  snvs[snvs$metagenome == mg, ]$in_first <- length(arr[arr == 1]) * 100 / length(arr)
  snvs[snvs$metagenome == mg, ]$in_second <- length(arr[arr == 2]) * 100 / length(arr)
  snvs[snvs$metagenome == mg, ]$in_third <- length(arr[arr == 3]) * 100 / length(arr)
}

# print it nicely
cat(sprintf("
=======================================================================
Percent SNV density for each Wolbachia genome in metagenomes from which
they were recovered, the average coverage of nucleotide positions at which
SNVs were observed, and the distribution of SNVs across codon positions
=======================================================================\n"))
for(mg in snvs$metagenome){
  cat(sprintf("Wolbachia in %s: %.2f%% (%d SNVs with an avg. cov. of %dX;
                                      %.1f%% in 1st,
                                      %.1f%% in 2nd,
                                      %.1f%% in 3rd nt pos).\n\n", 
              mg,
              snvs[snvs$metagenome == mg, ]$percent_snvs,
              snvs[snvs$metagenome == mg, ]$num_snvs,
              round(mean(df[df$sample_id == mg, ]$coverage)),
              snvs[snvs$metagenome == mg, ]$in_first,
              snvs[snvs$metagenome == mg, ]$in_second,
              snvs[snvs$metagenome == mg, ]$in_third))
}