#!/usr/bin/env python3
# -*- coding: utf-8 -*-


import pandas as pd
import os
import sys
import argparse
import gzip
from Bio import SeqIO,SeqRecord
import re

"""
IUPAC ambiguity codes for nucleotides:
    N Any nucleotide (A or C or G or T or U)
    R Purine (A or G)
    Y Pyrimidine (T or C)
    K Keto (G or T)
    M Amino (A or C)
    S Strong interaction (3 H bonds) (G or C)
    W Weak interaction (2 H bonds) (A or T)
    B Not A (C or G or T)
    D Not C (A or G or T)
    H Not G (A or C or T)
    V Not T or U (A or C or G)
"""

def genome_length(fasta, ambiguous = False):
    """Return the total base count and GC percentage of a FASTA file.

    Sequences are upper-cased before counting, so soft-masked (lower-case)
    bases are included. Only recognised symbols contribute to the length:
    A, C, G and T always, plus the ambiguity codes N, R, Y, K, M, S, W, B,
    D, H and V when ``ambiguous`` is true. Any other character (gaps, ``U``,
    ``*`` ...) is ignored.

    Args:
        fasta: Path to a FASTA file. A path ending in ``.gz`` is read
            through ``gzip`` in text mode.
        ambiguous: When true, ambiguity codes are added to the length (and
            therefore to the denominator of the GC percentage).

    Returns:
        A ``(length, gc_percent)`` tuple where ``gc_percent`` is
        ``(G + C) / length * 100``. When no recognised base is found the
        result is ``(0, 0.0)`` instead of raising ``ZeroDivisionError``.
    """
    other_bases = ["A", "T"]
    if ambiguous:
        other_bases += ["N", "R", "Y", "K", "M", "S", "W", "B", "D", "H", "V"]

    gc_count = 0
    other_count = 0

    # Open the handle ourselves so that it is closed deterministically,
    # including the gzip stream, once parsing is done.
    opener = gzip.open if fasta.endswith(".gz") else open
    with opener(fasta, "rt") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            seq = record.seq.upper()
            gc_count += seq.count("G") + seq.count("C")
            other_count += sum(seq.count(base) for base in other_bases)

    length = gc_count + other_count
    if length == 0:
        # Empty file or no countable base: GC content is undefined, report 0.
        return 0, 0.0
    return length, gc_count / length * 100



if __name__ == "__main__":
    # Dual entry point: when a global ``snakemake`` object exists its
    # params/output are used; otherwise the command line is parsed.
    try:
        path2fasta = str(snakemake.params.genomes_dir)
        outfile = str(snakemake.output)
        ambiguous_base = snakemake.params.ambiguous
        sep = "\t"
    except NameError:
        # Only a missing ``snakemake`` global selects the CLI path; any other
        # error (e.g. a missing param) is a real problem and must surface.
        parser = argparse.ArgumentParser(
            prog='genomes_length',
            description='Compute length and GC content of genome FASTA files.')
        parser.add_argument(
            'bin', type=str,
            help='(required) bin file or bins directory (fasta format) ')
        parser.add_argument(
            '--sep', type=str,
            default = "\t",
            help='(optional) column separator of the output table, default tab')
        parser.add_argument(
            '-o','--out', type=str,
            default=sys.stdout,
            help='(optional) Output file where the table will be written, default stdout')
        parser.add_argument(
            '-a','--ambiguous', type=str, default=None,
            help='(optional) set to true/yes/1 to also count ambiguous bases '
                 '(N, R, Y, K, M, S, W, B, D, H, V) in the genome length')
        args = parser.parse_args()

        path2fasta = args.bin
        outfile = args.out
        sep = args.sep
        ambiguous_base = args.ambiguous

    # The option may arrive as a string (CLI) or as any other object
    # (snakemake param); reduce it to a boolean.
    if isinstance(ambiguous_base, str):
        use_ambiguous = ambiguous_base.strip().lower() in {"1", "true", "t", "yes", "y", "on"}
    else:
        use_ambiguous = bool(ambiguous_base)

    if os.path.isdir(path2fasta):
        # Sorted so that the row order of the output table is reproducible.
        files = sorted(
            os.path.join(path2fasta, genome)
            for genome in os.listdir(path2fasta)
            if genome.endswith(".fa")
        )
    else:
        files = [path2fasta]

    # Cut the file name at the first ".fa"/".fasta" extension (which must be
    # followed by another "." or the end of the name), so that names that
    # merely contain "fa" (e.g. "alfa_bin.fa") are not truncated.
    extension = re.compile(r'\.(?:fasta|fa)(?=\.|$)')

    data = {}
    for f in files:
        bin_id = extension.split(os.path.basename(f), maxsplit=1)[0]
        length, gc = genome_length(f, ambiguous=use_ambiguous)
        data[bin_id] = {"length": length, "gc_content": format(gc, '.2f')}

    # One row per genome, columns "length" and "gc_content".
    df = pd.DataFrame(data).transpose()
    df.to_csv(outfile, header=True, index=True, sep=sep, index_label="genomes")
    
    