Source code for xyalign.assemble

# Part of XYalign
# Collection of functions for mapping reads, processing bams, etc.
from __future__ import print_function
import logging
import os
import subprocess
import time


# Create logger for assemble submodule
assemble_logger = logging.getLogger("xyalign.assemble")


[docs]def bwa_mem_mapping_sambamba( bwa_path, samtools_path, sambamba_path, reference, output_prefix, fastqs, threads, read_group_line, bwa_params, cram=False): """ Maps reads to a reference genome using bwa mem. If output is in bam format, will sort using sambamba, else will sort with samtools Parameters ---------- bwa_path : str The path to bwa samtools_path : str The path to samtools sambamba_path : str The path to sambamba reference : reftools.RefFasta() object reftools.RefFasta() object of reference genome (in fasta format) output_prefix : str The prefix (including path) to the desired output files fastqs : list Fastqs, e.g. ['sample_1.fastq', 'sample_2.fastq'] threads : int The number of threads/cpus to use read_group_line : str Read group info for bwa to add. If 'None', will not add read group. bwa_params : list Bwa parameters to be joined into a string and inserted into the command cram : bool If True, will output a sorted cram, else a sorted bam. Default is False. Returns ------- str Path to output bam file (indexed) Raises ------ RuntimeError If fastq or reference files cannot be found """ map_start = time.time() fastq_status = [os.path.exists(x) for x in fastqs] if all(fastq_status) is False: assemble_logger.error( "One or more fastq files cannot be found. Check paths.") raise RuntimeError("One or more fastq files cannot be found. Check paths.") if os.path.exists(reference.filepath) is False: assemble_logger.error("Reference file cannot be found. Check path.") raise RuntimeError("Reference file cannot be found. Check path.") if len(fastqs) == 2: single = False elif len(fastqs) == 1: single = True else: assemble_logger.error( "{} fastq files provided. Please provide only one or two".format( len(fastqs))) raise RuntimeError( "{} fastq files provided. Please provide only one or two".format( len(fastqs))) fastqs = ' '.join(fastqs) assemble_logger.info( "Beginning steps mapping fastqs ({}) to reference ({}) " "using bwa_mem_mapping_sambamba".format( fastqs, reference.filepath)) # Check that bwa index is not newer than reference (and re-index if it is) reference.conditional_index_bwa() # Check that .fai is not newer than reference (and re-index if it is) if reference.is_faidxed() is False: reference.index_fai() # Check that seq dict is not newer than reference (and re-index if it is) reference.conditional_seq_dict # BAM mapping if cram is False: output_file = "{}.sorted.bam".format(output_prefix) if single is False: if read_group_line != "None": command_line = "{} mem -t {} -R {} {} {} {} | {} fixmate -O bam - - | "\ "{} sort -t {} -o {} /dev/stdin".format( bwa_path, threads, repr(read_group_line), " ".join(bwa_params), reference.filepath, fastqs, samtools_path, sambamba_path, threads, output_file) else: command_line = "{} mem -t {} {} {} {} | {} fixmate -O bam - - | "\ "{} sort -t {} -o {} /dev/stdin".format( bwa_path, threads, " ".join(bwa_params), reference.filepath, fastqs, samtools_path, sambamba_path, threads, output_file) else: if read_group_line != "None": command_line = "{} mem -t {} -R {} {} {} {} | {} view -hb - | "\ "{} sort -t {} -o {} /dev/stdin".format( bwa_path, threads, repr(read_group_line), " ".join(bwa_params), reference.filepath, fastqs, samtools_path, sambamba_path, threads, output_file) else: command_line = "{} mem -t {} {} {} {} | {} view -hb - | "\ "{} sort -t {} -o {} /dev/stdin".format( bwa_path, threads, " ".join(bwa_params), reference.filepath, fastqs, samtools_path, sambamba_path, threads, output_file) assemble_logger.info( "Mapping reads with the command: {}".format(command_line)) subprocess.call(command_line, shell=True) assemble_logger.info( "Indexing {}".format(output_file)) subprocess.call( [sambamba_path, "index", "-t", str(threads), "{}".format(output_file)]) assemble_logger.info( "Completed mapping for fastqs ({}) to reference ({}). " "Elapsed time: {} seconds".format( fastqs, reference.filepath, time.time() - map_start)) return output_file # CRAM mapping # - note, doesn't currently support sambamba sorting (samtools instead) else: output_file = "{}.sorted.cram".format(output_prefix) if read_group_line != "None": command_line = "{} mem -t {} -R {} {} {} {} | {} fixmate -O cram - - | "\ "{} sort -O cram -o {} -".format( bwa_path, threads, repr(read_group_line), " ".join(bwa_params), reference, fastqs, samtools_path, samtools_path, output_file) else: command_line = "{} mem -t {} {} {} {} | {} fixmate -O cram - - | "\ "{} sort -O cram -o {} -".format( bwa_path, threads, " ".join(bwa_params), reference, fastqs, samtools_path, samtools_path, output_file) assemble_logger.info( "Mapping reads with the command: {}".format(command_line)) subprocess.call(command_line, shell=True) assemble_logger.info( "Indexing {}".format(output_file)) subprocess.call( [samtools_path, "index", "{}".format(output_file)]) assemble_logger.info( "Completed mapping for fastqs ({}) to reference ({}). " "Elapsed time: {} seconds".format( fastqs, reference, time.time() - map_start)) return output_file