configfile: "config/config.yaml"

rule all:
    input:
        "results/grafted.tre"

# Creates and populates a SQLite database with filtered sequence records.
# Uses BOLD dump TSV as defined in config file
rule create_database:
  input: config["file_names"]["bold_tsv"]
  output: 'results/databases/create_database.ok'
  params:
    db="results/databases/BOLD_{}_barcodes.db".format(config["marker"]),
    marker=config["marker"],
    minlength=config["minlength"],
    log_level=config['log_level']
  conda: "envs/create_database.yml"
  log: "logs/create_database.log"
  shell:
    """
    python workflow/scripts/bold_data_dump.py \
      -v {params.log_level} \
      -l {params.minlength} \
      -d {params.db} \
      -t {input} \
      -m {params.marker} 2> {log}
    touch {output}
    """

# Enriches the database with mappings to OpenToL. Because this operates on
# the same database file, the output is a 0-byte file `map_opentol.ok` to
# indicate that the task was run.
rule map_opentol:
  input: 'results/databases/create_database.ok'
  output: 'results/databases/map_opentol.ok'
  params:
    db="results/databases/BOLD_{}_barcodes.db".format(config["marker"]),
    marker=config['marker'],
    log_level=config['log_level']
  conda: "envs/map_opentol.yml"
  log: "logs/map_opentol.log"
  shell:
      """
      python workflow/scripts/map_opentol.py \
        -d {params.db} \
        -m {params.marker} \
        -v {params.log_level} \
        -o {output} 2> {log}
      """

# Creates and populates the OpenToL SQLite database. Uses the location of
# the OpenToL tree as per the config file. Merges the resulting database
# with the BOLD database (which gains a table `node`). SQLite can, at time
# of writing, not apply foreign key constraints retroactively. This is a
# shame because taxon.opentol_id implicitly references node.id.
rule megatree_loader:
    input:
        tree = config["file_names"]["open_tre"],
        mapping_ok = rules.map_opentol.output
    output: 'results/databases/megatree_loader.ok'
    conda: "envs/megatree_loader.yml"
    log: "logs/megatree_loader.log"
    params:
        db="results/databases/BOLD_{}_barcodes.db".format(config["marker"]),
        tempdb = temp("results/databases/opentree_nodes.db"),
        tempsql = temp("results/databases/node.sql")
    shell:
        """
        megatree-loader -i {input.tree} -d {params.tempdb} -v 2> {log}
        sqlite3 {params.tempdb} ".dump node" > {params.tempsql} 2>> {log}
        sqlite3 {params.db} < {params.tempsql} 2>> {log}
        touch {output} 2>> {log}
        rm {params.tempsql} {params.tempdb}
        """

# TODO either this is updated dynamically or scattergather is abandoned for dynamic()
scattergather:
  split = config["nfamilies"]

# Exports unaligned BIN sequences for each family within the total set. This task is parallelized as many times as
# specified by config["nfamilies"]. During this task, the longest raw sequence within each BIN is selected.
# TODO this should be modified to look at more criteria for reference sequence selection, as per BGE T10.1
rule family_fasta:
    input: rules.map_opentol.output
    output: scatter.split("results/fasta/family/{scatteritem}/unaligned.fa")
    params:
      log_level=config['log_level'],
      fasta_dir=config["file_names"]["fasta_dir"],
      filter_level=config["fasta_filter"]["filter_level"],
      filter_name=config["fasta_filter"]["filter_name"],
      db="results/databases/BOLD_{}_barcodes.db".format(config["marker"]),
      chunks=config["nfamilies"],
    conda: "envs/family_fasta.yml"
    log: "logs/family_fasta.log"
    shell:
        """
        python workflow/scripts/family_fasta.py \
            -d {params.db} \
            -f {params.fasta_dir} \
            -l {params.filter_level} \
            -n {params.filter_name} \
            -c {params.chunks} \
            -v {params.log_level} 2> {log}
        """

# Exports OpenToL newick file for each unaligned BIN sequence file. There are numerous issues to resolve here, including
# achieving greater coverage for the ingroup as per a recent email thread with the OpenToL PIs, which suggested a
# two-step process where a species is first expanded to its subspecies via the OpenToL taxonomy WS endpoint, which are
# stored in an intersection table in the database, and then collapsed again here. In addition, this constrains should
# (probably) also include the outgroup taxa. Alternatively, the branch lengths should be re-estimated after rooting.
# TODO improve OpenToL coverage and handle outgroups better
rule family_constraint:
    input:
        alignment = "results/fasta/family/{scatteritem}/unaligned.fa",
        megatree = 'results/databases/megatree_loader.ok'
    output: "results/fasta/family/{scatteritem}/unaligned.tre"
    params:
        temp=config["file_names"]["fasta_dir"] + "/{scatteritem}/unaligned.txt",
        db="results/databases/BOLD_{}_barcodes.db".format(config["marker"])
    conda: "envs/family_constraint.yml"
    log: "logs/family_constraint-{scatteritem}.log"
    shell:
        """
        grep '>' {input.alignment} | cut -f2 -d'|' | sort | uniq > {params.temp} 2> {log}
        megatree-pruner -i {params.temp} -d {params.db} -v > {output} 2>> {log}
        """

# Aligns sequences with HMM. Here, the sequences are also corrected for possible revcom issues. It is possible that this
# is not needed at all because so far 0 revcom sequences were observed in BOLD.
rule msa_hmm:
    input: "results/fasta/family/{scatteritem}/unaligned.fa"
    output: "results/fasta/family/{scatteritem}/aligned.fa"
    params:
        log_level=config['log_level'],
        hmm_file=config['file_names']['hmm'],
        db="results/databases/BOLD_{}_barcodes.db".format(config["marker"])
    conda: "envs/msa_hmm.yml"
    log: "logs/msa_hmm-{scatteritem}.log"
    shell:
        """
        python workflow/scripts/msa_hmm.py \
            -i {input} \
            -o {output} \
            -m {params.hmm_file} \
            -v {params.log_level} \
            -d {params.db} 2> {log}               
        """

# Prepares the aligned ingroup sequences and constraint tree for analysis by raxml-ng. This step also includes outgroup
# selection. A number of changes are needed here. Firstly, there are cases where the selected outgroup sequences yield
# results where the ingroup is not monophyletic with respect to the outgroups. This means that outgroup selection needs
# to be refined by considering all pairwise distances within the ingroup, and any outgroup must have a greater distance
# to the nearest ingroup taxon than the latter has to all other members of the ingroup. Secondly, the constraint tree
# should be generated here so that it also includes the outgroups, which means that an additional outgroup selection
# criterion should be that the candidate outgroups are in the OpenToL.
# TODO make sure outgroups are really outgroups and that they occur in the constraint tree
rule prep_raxml:
    input:
        alignment="results/fasta/family/{scatteritem}/aligned.fa",
        tree="results/fasta/family/{scatteritem}/unaligned.tre"
    output:
        alignment="results/fasta/family/{scatteritem}/raxml-ready.fa",
        tree="results/fasta/family/{scatteritem}/raxml-ready.tre"
    params:
        db="results/databases/BOLD_{}_barcodes.db".format(config["marker"]),
        log_level=config['log_level'],
        num_outgroups = config['outgroups']
    conda: "envs/prep_raxml.yml"
    log: "logs/prep_raxml-{scatteritem}.log"
    shell:
        """
        python workflow/scripts/prep_raxml.py \
            -t {input.tree} \
            -o {output.tree} \
            -a {input.alignment} \
            -f {output.alignment} \
            -v {params.log_level} \
            -n {params.num_outgroups} \
            -d {params.db} 2> {log}
        """

# Runs raxml as a constrained tree search. Possibly this should instead take a two-step approach, where unplaced
# sequences are first placed, and then the branch lengths are estimated. This requires that there actually is a
# constraint tree. Currently, this rule deals with the fact that some of the constraint trees are 0-byte files due
# to incomplete OpenToL coverage by trapping raxml-ng errors and then re-running without the constraint.
rule run_raxml:
    input:
        alignment = "results/fasta/family/{scatteritem}/raxml-ready.fa",
        tree = "results/fasta/family/{scatteritem}/raxml-ready.tre"
    output:
        tree = "results/fasta/family/{scatteritem}/raxml-ready.fa.raxml.bestTree"
    params:
        model = config['model'],
        num_outgroups= config['outgroups']
    log: "logs/run_raxml-{scatteritem}.log"
    conda: "envs/raxml.yml"
    shell:
        """
        OG=$(grep '>' {input.alignment} | tail -{params.num_outgroups} | sed -e 's/>//' | tr '\n' ',')
        if [ -s {input.tree} ]; then
          set -e
          raxml-ng \
            --msa {input.alignment} \
            --outgroup $OG \
            --model {params.model} \
            --tree-constraint {input.tree} \
            --search > {log} 2>&1 \
          || \
          raxml-ng \
            --msa {input.alignment} \
            --outgroup $OG \
            --model {params.model} \
            --search >> {log} 2>&1
        else
          raxml-ng --msa {input.alignment} --outgroup $OG --model {params.model} --search > {log} 2>&1
        fi
        """


rule reroot_raxml_output:
    input:
        tree = "results/fasta/family/{scatteritem}/raxml-ready.fa.raxml.bestTree",
        constraint = "results/fasta/family/{scatteritem}/raxml-ready.tre"
    output:
        outtree = "results/fasta/family/{scatteritem}/raxml-ready.fa.raxml.bestTree.rooted"
    params:
        log_level = config['log_level']
    log: "logs/reroot_raxml_output-{scatteritem}.log"
    conda: "envs/reroot_backbone.yml"
    shell:
        """
        python workflow/scripts/reroot_backbone.py \
            -i {input.tree} \
            -c {input.constraint} \
            -o {output.outtree} \
            -v {params.log_level} 2> {log}
        """

# TODO choose tall exemplars, not shallow ones
rule choose_exemplars:
    input:
        alignment = "results/fasta/family/{scatteritem}/aligned.fa",
        tree= "results/fasta/family/{scatteritem}/raxml-ready.fa.raxml.bestTree.rooted"
    output: "results/fasta/family/{scatteritem}/exemplars.fa"
    params:
        db="results/databases/BOLD_{}_barcodes.db".format(config["marker"]),
        log_level = config['log_level']
    log: "logs/choose_exemplars-{scatteritem}.log"
    conda: "envs/choose_exemplars.yaml"
    shell:
        """
        python workflow/scripts/choose_exemplars.py \
            -d {params.db} \
            -v {params.log_level} \
            -t {input.tree} \
            -i {input.alignment} \
            -o {output} 2> {log}
        """


# gathers the results
rule prep_raxml_backbone:
    input:
        fastas=gather.split("results/fasta/family/{scatteritem}/exemplars.fa")
    output:
        fasta="results/fasta/raxml-ready.fa",
        tree="results/fasta/raxml-ready.tre"
    params:
        db="results/databases/BOLD_{}_barcodes.db".format(config["marker"]),
        log_level = config['log_level']
    log: "logs/prep_raxml_backbone.log"
    conda: "envs/prep_raxml_backbone.yml"
    shell:
        """
        cat {input.fastas} > {output.fasta}
        python workflow/scripts/backbone_constraint.py \
            -d {params.db} \
            -v {params.log_level} \
            -i '{input.fastas}' \
            -o {output.tree} 2> {log}
        """


rule run_raxml_backbone:
    input:
        alignment = "results/fasta/raxml-ready.fa",
        tree = "results/fasta/raxml-ready.tre"
    output:
        tree = "results/fasta/raxml-ready.fa.raxml.bestTree"
    params:
        model = config['model']
    log: "logs/run_raxml_backbone.log"
    conda: "envs/raxml.yml"
    shell:
        """
          raxml-ng \
            --msa {input.alignment} \
            --model {params.model} \
            --tree-constraint {input.tree} \
            --search > {log} 2>&1
        """


rule reroot_backbone:
    input:
        backbone = "results/fasta/raxml-ready.fa.raxml.bestTree",
        constraint = "results/fasta/raxml-ready.tre"
    output:
        tree = "results/fasta/raxml-ready.fa.raxml.bestTree.rooted"
    params:
        log_level = config['log_level']
    log: "logs/reroot_backbone.log"
    conda: "envs/reroot_backbone.yml"
    shell:
        """
        python workflow/scripts/reroot_backbone.py \
            -i {input.backbone} \
            -c {input.constraint} \
            -o {output.tree} \
            -v {params.log_level} 2> {log}
        """


rule graft_clades:
    input:
        backbone = "results/fasta/raxml-ready.fa.raxml.bestTree.rooted",
        clades = config['file_names']['fasta_dir']
    output:
        tree = "results/grafted.tre"
    params:
        log_level = config['log_level'],
        nfamilies = config["nfamilies"]
    log: "logs/graft_clades.log"
    conda: "envs/graft_clades.yml"
    shell:
        """
        python workflow/scripts/graft_clades.py \
            -t {input.backbone} \
            -f {input.clades} \
            -o {output.tree} \
            -n {params.nfamilies} \
            -v {params.log_level} 2> {log}                  
        """
