Source code for xyalign.xyalign

# XYalign main program
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import collections
import csv
import logging
import os
import subprocess
import sys
import time
import pandas as pd
import xyalign.assemble as assemble
import xyalign.bam as bam
import xyalign.ploidy as ploidy
import xyalign.reftools as reftools
import xyalign.utils as utils
import xyalign.variants as variants
# from xyalign import assemble, bam, ploidy, reftools, utils, variants

# Grab XYalign version from _version.py in the xyalign directory
dir = os.path.dirname(__file__)
version_py = os.path.join(dir, "_version.py")
exec(open(version_py).read())


[docs]def parse_args(): """ Parse command-line arguments Returns ------- Parser argument namespace """ parser = argparse.ArgumentParser(description="XYalign") parser.add_argument( "--bam", nargs="*", help="Full path to input bam files. If more " "than one provided, only the first will be used for modules other " "than --CHROM_STATS") parser.add_argument( "--cram", nargs="*", help="Full path to input cram files. If more " "than one provided, only the first will be used for modules other " "than --CHROM_STATS. Not currently supported.") parser.add_argument( "--sam", nargs="*", help="Full path to input sam files. If more " "than one provided, only the first will be used for modules other " "than --CHROM_STATS. Not currently supported.") parser.add_argument( "--ref", required=True, help="REQUIRED. Path to reference sequence (including file name).") parser.add_argument( "--output_dir", "-o", required=True, help="REQUIRED. Output directory. XYalign will create a directory " "structure within this directory") parser.add_argument( "--chromosomes", "-c", nargs="*", default=[None], help="Chromosomes to analyze (names must match reference exactly). " "For humans, we recommend at least chr19, chrX, chrY. Generally, we " "suggest including the sex chromosomes and at least one autosome. " "To analyze all chromosomes use '--chromosomes ALL' or " "'--chromosomes all'.") parser.add_argument( "--x_chromosome", "-x", nargs="*", default=[None], help="Names of x-linked scaffolds in reference fasta (must match " "reference exactly). ") parser.add_argument( "--y_chromosome", "-y", nargs="*", default=[None], help="Names of y-linked scaffolds in reference fasta (must match " "reference exactly). Defaults to chrY. Give None if using an assembly " "without a Y chromosome") parser.add_argument( "--sample_id", "-id", default="sample", help="Name/ID of sample - for use in plot titles and file naming. " "Default is sample") parser.add_argument( "--cpus", type=int, default=1, help="Number of cores/threads to use. Default is 1") parser.add_argument( "--xmx", default="None", help="Memory to be provided to java programs via -Xmx. E.g., use the " "flag '--xmx 4g' to pass '-Xmx4g' as a flag when running java " "programs (currently just repair.sh). Default is 'None' (i.e., nothing " "provided on the command line), which will allow repair.sh to " "automatically allocate memory. Note that if you're using --STRIP_READS " "on deep coverage whole genome data, you might need quite a bit of memory, " "e.g. '--xmx 16g', '--xmx 32g', or more depending on how many reads " "are present per read group.") parser.add_argument( "--fastq_compression", default=3, type=int, choices=range(0, 10), help="Compression level for fastqs output from repair.sh. Between " "(inclusive) 0 and 9. Default is 3. 1 through 9 indicate " "compression levels. If 0, fastqs will be uncompressed.") parser.add_argument( "--single_end", action="store_true", default=False, help="Include flag if reads are single-end and NOT paired-end.") parser.add_argument( "--version", "-V", action="version", version="XYalign {}".format(__version__), help="Print version and exit.") parser.add_argument( "--no_cleanup", action="store_true", default=False, help="Include flag to preserve temporary files.") # Options to run specific parts of the pipeline pipeline_group = parser.add_mutually_exclusive_group(required=False) pipeline_group.add_argument( "--PREPARE_REFERENCE", action="store_true", default=False, help="This flag will limit XYalign to only preparing reference fastas " "for individuals with and without Y chromosomes. These fastas can " "then be passed with each sample to save subsequent processing time.") pipeline_group.add_argument( "--CHROM_STATS", action="store_true", default=False, help="This flag will limit XYalign to only analyzing provided bam files " "for depth and mapq across entire chromosomes.") pipeline_group.add_argument( "--ANALYZE_BAM", action="store_true", default=False, help="This flag will limit XYalign to only analyzing the bam file for " "depth, mapq, and (optionally) read balance and outputting plots.") pipeline_group.add_argument( "--CHARACTERIZE_SEX_CHROMS", action="store_true", default=False, help="This flag will limit XYalign to the steps required to " "characterize sex chromosome content (i.e., analyzing the bam " "for depth, mapq, and read balance and running statistical tests " "to help infer ploidy)") pipeline_group.add_argument( "--REMAPPING", action="store_true", default=False, help="This flag will limit XYalign to only the steps required to " "strip reads and remap to masked references. If masked references " "are not provided, they will be created.") pipeline_group.add_argument( "--STRIP_READS", action="store_true", default=False, help="This flag will limit XYalign to only the steps required to " "strip reads from a provided bam file.") # Logging options parser.add_argument( "--logfile", default=None, help="Name of logfile. Will overwrite if exists. Default is " "sample_xyalign.log") parser.add_argument( "--reporting_level", default='INFO', choices=["DEBUG", "INFO", "ERROR", "CRITICAL"], help="Set level of messages printed to console. Default is 'INFO'. " "Choose from (in decreasing amount of reporting) DEBUG, INFO, ERROR " "or CRITICAL") # Program paths parser.add_argument( "--platypus_path", default="platypus", help="Path to platypus. Default is 'platypus'. If platypus is not " "directly callable (e.g., '/path/to/platypus' or " "'/path/to/Playpus.py'), then provide path to python as well (e.g., " "'/path/to/python /path/to/platypus'). In addition, be sure provided " "python is version 2. See the documentation for more information " "about setting up an anaconda environment.") parser.add_argument( "--bwa_path", default="bwa", help="Path to bwa. Default is 'bwa'") parser.add_argument( "--samtools_path", default="samtools", help="Path to samtools. Default is 'samtools'") parser.add_argument( "--repairsh_path", default="repair.sh", help="Path to bbmap's repair.sh script. Default is 'repair.sh'") parser.add_argument( "--shufflesh_path", default="repair.sh", help="Path to bbmap's shuffle.sh script. Default is 'shuffle.sh'") parser.add_argument( "--sambamba_path", default="sambamba", help="Path to sambamba. Default is 'sambamba'") parser.add_argument( "--bedtools_path", default="bedtools", help="Path to bedtools. Default is 'bedtools'") # Options for turning on/off parts of the pipeline parser.add_argument( "--platypus_calling", default="both", choices=["both", "none", "before", "after"], help="Platypus calling withing the pipeline " "(before processing, after processing, both, " "or neither). Options: both, none, before, after.") parser.add_argument( "--no_variant_plots", action="store_true", default=False, help="Include flag to prevent plotting read balance from VCF files.") parser.add_argument( "--no_bam_analysis", action="store_true", default=False, help="Include flag to prevent depth/mapq analysis of bam file. " "Used to isolate platypus_calling.") parser.add_argument( "--skip_compatibility_check", action="store_true", default=False, help="Include flag to prevent check of compatibility between input bam " "and reference fasta") parser.add_argument( "--no_perm_test", action="store_true", default=False, help="Include flag to turn off permutation tests.") parser.add_argument( "--no_ks_test", action="store_true", default=False, help="Include flag to turn off KS Two Sample tests.") parser.add_argument( "--no_bootstrap", action="store_true", default=False, help="Include flag to turn off bootstrap analyses. Requires either " "--y_present, --y_absent, or --sex_chrom_calling_threshold " "if running full pipeline.") # Variant Flags parser.add_argument( "--variant_site_quality", "-vsq", type=int, default=30, help="Consider all SNPs with a site quality (QUAL) greater than or " "equal to this value. Default is 30.") parser.add_argument( "--variant_genotype_quality", "-vgq", type=int, default=30, help="Consider all SNPs with a sample genotype quality greater than or " "equal to this value. Default is 30.") parser.add_argument( "--variant_depth", "-vd", type=int, default=4, help="Consider all SNPs with a sample depth greater than or " "equal to this value. Default is 4.") parser.add_argument( "--platypus_logfile", default=None, help="Prefix to use for Platypus log files. Will default to the " "sample_id argument provided") parser.add_argument( "--homogenize_read_balance", type=bool, default=False, help="If True, read balance values will be transformed by subtracting " "each value from 1. For example, 0.25 and 0.75 would be treated " "equivalently. Default is False.") parser.add_argument( "--min_variant_count", type=int, default=1, help="Minimum number of variants in a window for the read balance of " "that window to be plotted. Note that this does not affect plotting " "of variant counts. Default is 1, though we note that many window " "averages will be meaningless at this setting.") # Reference-related Flags parser.add_argument( "--reference_mask", nargs="*", help="Bed file containing regions to replace with Ns in the sex " "chromosome reference. Examples might include the pseudoautosomal " "regions on the Y to force all mapping/calling on those regions of the " "X chromosome. Default is None.") parser.add_argument( "--xx_ref_out_name", default=None, help="Desired name for masked output fasta for " "samples WITHOUT a Y chromosome (e.g., XX, XXX, XO, etc.). " "Defaults to 'xyalign_noY.masked.fa'. Will be output " "in the XYalign reference directory.") parser.add_argument( "--xy_ref_out_name", default=None, help="Desired name for masked output fasta for " "samples WITH a Y chromosome (e.g., XY, XXY, etc.). " "Defaults to 'xyalign_withY.masked.fa'. Will be output " "in the XYalign reference directory.") parser.add_argument( "--xx_ref_out", default=None, help="Desired path to and name of masked output fasta for " "samples WITHOUT a Y chromosome (e.g., XX, XXX, XO, etc.). " "Overwrites if exists. " "Use if you would like output somewhere other than XYalign reference " "directory. Otherwise, use --xx_ref_name.") parser.add_argument( "--xy_ref_out", default=None, help="Desired path to and name of masked output fasta for " "samples WITH a Y chromosome (e.g., XY, XXY, etc.). Overwrites if exists. " "Use if you would like output somewhere other than XYalign reference " "directory. Otherwise, use --xy_ref_name.") parser.add_argument( "--xx_ref_in", default=None, help="Path to preprocessed reference fasta to be used for remapping in " "X0 or XX samples. Default is None. If none, will produce a " "sample-specific reference for remapping.") parser.add_argument( "--xy_ref_in", default=None, help="Path to preprocessed reference fasta to be used for remapping in " "samples containing Y chromosome. Default is None. If none, will " "produce a sample-specific reference for remapping.") parser.add_argument( "--bwa_index", type=bool, default=False, help="If True, index with BWA during PREPARE_REFERENCE. Only relevant" "when running the PREPARE_REFERENCE module by itself. Default is False.") # Mapping/remapping arguments parser.add_argument( "--read_group_id", default="xyalign", type=str, help="If read groups are present in a bam file, they are used by default in " "remapping steps. However, if read groups are not present in a file, " "there are two options for proceeding. If '--read_group_id None' is " "provided (case sensitive), then no read groups will be used in " "subsequent mapping steps. Otherwise, any other string provided to " "this flag will be used as a read group ID. " "Default is '--read_group_id xyalign'") parser.add_argument( "--bwa_flags", type=str, default="", help="Provide a string (in quotes, with spaces between arguments) " "for additional flags desired for BWA mapping (other than -R and -t). " "Example: '-M -T 20 -v 4'. Note that those are spaces between " "arguments.") parser.add_argument( "--sex_chrom_bam_only", action="store_true", default=False, help="This flag skips merging the new sex chromosome bam file back " "into the original bam file (i.e., sex chrom swapping). This will " "output a bam file containing only the newly remapped sex chromosomes.") parser.add_argument( "--sex_chrom_calling_threshold", type=float, default=2.0, help="This is the *maximum* filtered X/Y depth ratio for an individual " "to be considered as having heterogametic sex chromsomes (e.g., XY) for " "the REMAPPING module of XYalign. " "Note here that X and Y chromosomes are simply the chromosomes that " "have been designated as X and Y via --x_chromosome and --y_chromosome. " "Keep in mind that the ideal threshold will vary according to sex " "determination mechanism, sequence homology between the sex chromosomes, " "reference genome, sequencing methods, etc. See documentation for more " "detail. Default is 2.0, which we found to be reasonable for exome, " "low-coverage whole-genome, and high-coverage whole-genome human data.") group_y = parser.add_mutually_exclusive_group(required=False) group_y.add_argument( "--y_present", action="store_true", default=False, help="Overrides sex chr estimation by XYalign and remaps with Y present.") group_y.add_argument( "--y_absent", action="store_true", default=False, help="Overrides sex chr estimation by XY align and remaps with Y absent.") # Bam Analysis and Characterize Sex Chrom Flags parser.add_argument( "--window_size", "-w", default=None, help="Window size (integer) for sliding window calculations. Default " "is 50000. Default is None. If set to None, will use targets " "provided using --target_bed.") parser.add_argument( "--target_bed", default=None, help="Bed file containing targets to use in sliding window analyses " "instead of a fixed window width. Either this or --window_size needs " "to be set. Default is None, which will use window size provided " "with --window_size. If not None, and --window_size is None, analyses " "will use targets in provided file. Must be typical bed format, " "0-based indexing, with the first three columns containing " "the chromosome name, start coordinate, stop coordinate.") parser.add_argument( "--exact_depth", action="store_true", default=False, help="Calculate exact depth within windows, else use much faster " "approximation. *Currently exact is not implemented*. Default is False.") parser.add_argument( "--whole_genome_threshold", action="store_true", default=False, help="This flag will calculate the depth filter threshold based on " "all values from across the genome. By default, thresholds are " "calculated per chromosome.") parser.add_argument( "--mapq_cutoff", "-mq", type=int, default=20, help="Minimum mean mapq threshold for a window to be " "considered high quality. Default is 20.") parser.add_argument( "--min_depth_filter", type=float, default=0.0, help="Minimum depth threshold for a window to be considered high " "quality. Calculated as mean depth * min_depth_filter. So, a " "min_depth_filter of 0.2 would require at least a minimum depth " "of 2 if the mean depth was 10. Default is 0.0 to consider all windows.") parser.add_argument( "--max_depth_filter", type=float, default=10000.0, help="Maximum depth threshold for a window to be considered high " "quality. Calculated as mean depth * max_depth_filter. So, a " "max_depth_filter of 4 would require depths to be less than or " "equal to 40 if the mean depth was 10. " "Default is 10000.0 to consider all windows.") parser.add_argument( "--num_permutations", type=int, default=10000, help="Number of permutations to use for permutation analyses. " "Default is 10000") parser.add_argument( "--num_bootstraps", type=int, default=10000, help="Number of bootstrap replicates to use when bootstrapping mean " "depth ratios among chromosomes. Default is 10000") parser.add_argument( "--ignore_duplicates", action="store_true", default=False, help="Ignore duplicate reads in bam analyses. Default is to include " "duplicates.") # Plotting flags parser.add_argument( "--marker_size", type=float, default=10.0, help="Marker size for genome-wide plots in matplotlib. Default is 10.") parser.add_argument( "--marker_transparency", "-mt", type=float, default=0.5, help="Transparency of markers in genome-wide plots. " "Alpha in matplotlib. Default is 0.5") parser.add_argument( "--coordinate_scale", type=int, default=1000000, help="For genome-wide scatter plots, divide all coordinates by this value." "Default is 1000000, which will plot everything in megabases.") parser.add_argument( "--include_fixed", type=bool, default=False, help="Default is False, which removes read balances less than or equal to " "0.05 and equal to 1.0 for histogram plotting. True will include " "all values. Extreme values removed by default because they often swamp " "out the signal of the rest of the distribution.") # CHROM_STATS flags parser.add_argument( "--use_counts", action="store_true", default=False, help="If True, get counts of reads per chromosome for CHROM_STATS, rather " "than calculating mean depth and mapq. Much faster, but provides less " "information. Default is False") args = parser.parse_args() # Validate Arguments mods = [ args.PREPARE_REFERENCE, args.ANALYZE_BAM, args.CHARACTERIZE_SEX_CHROMS, args.REMAPPING, args.STRIP_READS, args.CHROM_STATS] if not any(mods) is True: full_pipeline = True else: full_pipeline = False # Ensure only one of bam, sam, or cram is set bam_flags = [args.bam, args.cram, args.sam] bam_flags = [x for x in bam_flags if x is not None] if len(bam_flags) > 1: sys.exit( "Error. --bam, --cram, and --sam are mutally exclusive, please " "provide no more than one. Submitted values are: --bam {} " "--cram {} --sam {}".format(args.bam, args.cram, args.sam)) elif len(bam_flags) == 0: if args.PREPARE_REFERENCE is not True: sys.exit( "Error. All modules other than PREPARE_REFERENCE require an input " "bam file.") # Cram and SAM files are currently unsupported if args.cram is not None: sys.exit( "Error. XYalign does not currently support cram files. " "Please provide a bam file via --bam instead.") if args.sam is not None: sys.exit( "Error. XYalign does not currently support sam files. " "Please provide a bam file via --bam instead.") # Exact depth is not currently implemented if args.exact_depth is True: sys.exit( "Error. Exact depth is not currently implemented. Exiting.") # Validate ploidy test arguments test_flags = [args.no_perm_test, args.no_bootstrap, args.no_ks_test] if full_pipeline is True: if args.no_bootstrap is True: if args.y_present is False and args.y_absent is False: sys.exit( "Error. Either --y_present or --y_absent needs to be " "included with if running the full pipeline " "with --no_bootstrap. XYalign uses the results of the " "bootstrap for sex chromosome complement inference.") if args.REMAPPING is True: if any([args.y_present, args.y_absent]) is False: sys.exit( "Error. Either --y_present or --y_absent needs to be " "included with if runnning --REMAPPING. If you are " "interested in both sex chromosome complement inference " "and remapping based on the results of said inference, " "consider running the full pipeline.") # Validate chromosome arguments if any( [full_pipeline, args.ANALYZE_BAM, args.CHARACTERIZE_SEX_CHROMS, args.STRIP_READS, args.CHROM_STATS]) is True: if args.chromosomes == [None]: sys.exit("Please provide chromosome names to analyze (--chromosomes)") if any( [full_pipeline, args.CHARACTERIZE_SEX_CHROMS]) is True: if len(args.chromosomes) < 2: sys.exit( "You only provided a single chromosome to analyze with " "--chromosomes. CHARACTERIZE_SEX_CHROMS requires at least two " "chromosomes, including one designated as an X chromosome.") if any( [full_pipeline, args.CHARACTERIZE_SEX_CHROMS, args.REMAPPING]) is True: if args.x_chromosome == [None]: sys.exit( "Please provide an 'X chromosome' to analyze (--x_chromosome). " "Note that this does not have to actually be an x-linked sequence, " "but this designation is required for the tests in " "CHARACTERIZE_SEX_CHROMS, which involve pairwise comparisons " "between the 'X chromosome', the 'Y chromosome', and all other " "'autosomes' (see documentation for more details).") if any( [full_pipeline, args.PREPARE_REFERENCE, args.REMAPPING]) is True: if args.y_chromosome == [None]: sys.exit( "Please provide an 'Y chromosome' for the full pipeline, " "--PREPARE_REFERENCE, or --REMAPPING. It is required for the " "creation and use of an alternate reference (e.g., " "masking the Y chromosome for the XX reference)") # Validate bwa arguments bwa_args = [str(x).strip() for x in args.bwa_flags.split()] red_list = ["-rm", "rm", "-rf", "rf", "-RM", "RM", "-RF", "RF"] if any(x in bwa_args for x in red_list): sys.exit( "Found either rm or rf in your bwa flags. Exiting to prevent " "unintended shell consequences") yellow_list = ["-R", "-t"] if any(x in bwa_args for x in yellow_list): sys.exit( "Found either -R or -t in bwa flags. These flags are already used " "in XYalign. Please remove.") # Validate sliding window options if any( [full_pipeline, args.ANALYZE_BAM, args.CHARACTERIZE_SEX_CHROMS]) is True: if args.window_size is not None and args.window_size != "None": if args.window_size.isdigit() is False: sys.exit( "--window_size needs to be either None or a positive integer. " "Exiting.") else: if args.target_bed is None: sys.exit( "If --window_size is None, --target_bed needs to be used. " "Please set one of these two flags if running ANALYZE_BAM, " "CHARACTERIZE_SEX_CHROMS, or the full pipeline. Exiting.") elif os.path.exists(args.target_bed) is False: sys.exit( "Invalid file provided with --target_bed. Check path. Exiting.") # Return arguments namespace return args
[docs]def ref_prep( ref_obj, ref_mask, ref_dir, xx, xy, y_chromosome, samtools_path, bwa_path, bwa_index): """ Reference prep part of XYalign pipeline. * Creates two reference fasta files. Both will include masks provied with ref_mask. One will additionally have the entire Y chromosome hard masked. * Indexes (.fai, .dict, and optionally bwa indices) both new references Parameters ---------- ref_obj : RefFasta() object A reftools.RefFasta() object of a fasta reference file to be processed ref_mask : list or None List of files to use to *hard-mask* references. None will ignore masking. ref_dir : str Path to output directory xx : str Path to XX output reference xy : str Path to XY output reference y_chromosome : str Name of Y chromosome in fasta samtools_path : str The path to samtools (i.e, "samtools" if in path) bwa_path : str The path to bwa (i.e, "bwa" if in path) bwa_index : bool If True, create bwa indices. Don't if False. Returns ------- tuple Paths to two masked references (y_masked, y_unmasked) """ logger = logging.getLogger("xyalign") # Combine masks, if more than one present if ref_mask is not None: if len(ref_mask) > 1: reference_mask = utils.merge_bed_files( "{}/reference_mask.merged.bed".format( ref_dir), *ref_mask) else: reference_mask = ref_mask[0] else: reference_mask = None # Create masked noY reference y_mask = ref_obj.chromosome_bed("{}/Y.bed".format( ref_dir), y_chromosome) if reference_mask is not None: noy_out = ref_obj.mask_reference( utils.merge_bed_files( "{}/reference_mask.maskY.merged.bed".format( ref_dir), reference_mask, y_mask), xx) else: noy_out = ref_obj.mask_reference(y_mask, xx) noy_ref = reftools.RefFasta(noy_out, samtools_path, bwa_path) noy_ref.seq_dict() # Create masked withY reference if reference_mask is not None: withy_out = ref_obj.mask_reference(reference_mask, xy) else: logger.info( "No reference mask provided, copying full reference to {} as " "XY reference to prevent damage, modification, etc. to original " "reference.".format(xy)) subprocess.call(["cp", ref_obj.filepath, xy]) withy_out = xy withy_ref = reftools.RefFasta(withy_out, samtools_path, bwa_path) withy_ref.seq_dict() if bwa_index is True: noy_ref.index_bwa() withy_ref.index_bwa() return (noy_ref, withy_ref)
[docs]def chrom_stats(bam_obj_list, chrom_list, use_counts): """ Runs chrom stats module. Calculates mean depth and mapq across entire scaffolds for a list of bam files Input ----- bam_obj_list : list List of bam.BamFile() objects. Need to have been created using same reference (i.e., seq names and lengths are the same) chrom_list : list List of chromosome names to analyze use_counts : bool If True, will just grab counts from bam index INSTEAD of traversing for depth and mapq Returns ------- tuple Tuple containing two dictionaries with results for depth and mapq, respectively. Or, if use_counts is True, returns a tuple containing the count dictionary and None. """ logger = logging.getLogger("xyalign") logger.info( "Running CHROM_STATS on following bam files: {}".format( ", ".join([x.filepath for x in bam_obj_list]))) comp_check = utils.check_compatibility_bam_list(bam_obj_list) if comp_check is False: logger.error( "Bam files incompatible - ensure same reference used for all files " "and that sequence portion of all bam headers match. Exiting") logging.shutdown() sys.exit(1) if chrom_list == ["ALL"] or chrom_list == ["all"]: chrom_list = list(bam_obj_list[0].chromosome_names()) missing_chroms = bam_obj_list[0].check_chrom_in_bam(chrom_list) if len(missing_chroms) != 0: logger.error( "One or more chromosomes provided via --chromosomes not " "present in bam files. Exiting.") logging.shutdown() sys.exit(1) if use_counts is False: chrom_depth_dict = collections.OrderedDict() chrom_mapq_dict = collections.OrderedDict() chrom_depth_dict["header"] = [ os.path.basename(x.filepath) for x in bam_obj_list] chrom_depth_dict["header"].insert(0, "chrom") chrom_mapq_dict["header"] = [ os.path.basename(x.filepath) for x in bam_obj_list] chrom_mapq_dict["header"].insert(0, "chrom") for chromosome in chrom_list: chrom_results = [ x.chrom_stats(chromosome, False) for x in bam_obj_list] chrom_depth_dict[chromosome], chrom_mapq_dict[chromosome] = zip( *chrom_results) chrom_depth_dict[chromosome] = (chromosome,) + chrom_depth_dict[chromosome] chrom_mapq_dict[chromosome] = (chromosome,) + chrom_mapq_dict[chromosome] return (chrom_depth_dict, chrom_mapq_dict) else: chrom_count_dict = collections.OrderedDict() chrom_count_dict["header"] = [ os.path.basename(x.filepath) for x in bam_obj_list] chrom_count_dict["header"].insert(0, "chrom") for chromosome in chrom_list: chrom_count_dict[chromosome] = [chromosome] for i in bam_obj_list: idx_stats = i.chrom_counts() for k in idx_stats: if k.contig in chrom_count_dict: chrom_count_dict[k.contig].append(int(k.mapped)) return (chrom_count_dict, None)
[docs]def bam_analysis( input_bam_obj, platypus_calling, platypus_path, vcf_log, ref_obj, input_chroms, cpus, out_vcf, no_variant_plots, window_size, target_bed, sample_id, readbalance_prefix, variant_site_quality, variant_genotype_quality, variant_depth, marker_size, marker_transparency, homogenize_read_balance, data_frame_readbalance, min_variant_count, no_bam_analysis, ignore_duplicates, exact_depth, whole_genome_threshold, mapq_cutoff, min_depth_filter, max_depth_filter, depth_mapq_prefix, bam_data_frame, output_bed_high, output_bed_low, use_bed_for_platypus, coordinate_scale, fixed): """ Runs bam analyis part of XYalign pipeline on bam file. * (Optionally) calls variants using Platypus * (Optionally) parses and filters Platypus vcf, and plots read balance * (Optionally) Calculates window based metrics from the bam file: depth and mapq * (optionally) Plots window-based metrics * Outputs two bed files: high quality windows, and low quality windows. Parameters ---------- input_bam_obj : bam.BamFile() object platypus_calling : bool If True, will call and analyze variants platypus_path : str Command to call platypus (e.g, "platypus") vcf_log : str Path to file for platypus log ref_obj : reftools.RefFasta() object input_chroms : list Chromosomes to analyze cpus : int Number of threads/cpus out_vcf : str Output vcf path/name no_variant_plots : bool If True, will not plot read balance window_size : int or None Window size for sliding window analyses (both bam and vcf). If None, will use regions in target_bed target_bed : str or None Path to bed file containing targets to use in sliding window analyses sample_id : str readbalance_prefix : str Prefix, including full path, to use for output files for readbalance analyses variant_site_quality : int Minimum site quality (PHRED) for a site to be included in readbalance analyses variant_genotype_quality : int Minimum genotype quality for a site to be included in read balance analyses variant_depth : int Minimum depth for a site to be included in read balance analyses marker_size : float Marker size for plotting genome scatter plots marker_transparency: float Value to use for marker transparency in genome scatter plots homogenize_read_balance : bool If true, will subtract values less than 0.5 from 1. I.e., 0.25 and 0.75 would be treated equivalently data_frame_readbalance: str Path of output file for full read balance dataframe min_variant_count : int Minimum number of variants in a given window for the window to be plotted in window-based read balance analyses no_bam_analysis : bool If True, no bam analyses will take place ignore_duplicates : bool If True, duplicates excluded from bam analyses exact_depth : bool If True, exact depth calculated in each window. Else, a much faster approximation will be used whole_genome_threshold : bool If True, values for depth filters will be calculated using mean from across all chromosomes included in analyses. Else, mean will be taken per chromosome min_depth_filter : float Minimum depth threshold for a window to be considered high. Calculated as mean depth * min_depth_filter. max_depth_filter : float Maximum depth threshold for a window to be considered high. Calculated as mean depth * min_depth_filter. depth_mapq_prefix : str Prefix, including full path, to be used for files output from depth and mapq analyses bam_data_frame : str Full path to output file for dataframe containing all data from bam analyses output_bed_high : str Full path to output bed containing high quality (i.e., passing filters) windows output_bed_low : str Full path to output bed containing low quality (i.e., failing filters) windows use_bed_for_platypus : bool If True, use output_bed_high as regions for Platypus calling coordinate_scale : int Divide all coordinates by this value for plotting. In most cases, 1000000 will be ideal for eukaryotic genomes. fixed : bool If False, only plots histogram for values between 0.05 and 1.0 (non-inclusive). If True, plots histogram of all variants. Returns ------- tuple (list of pandas dataframes with passing windows, list of pandas dataframes with failing windows) """ logger = logging.getLogger("xyalign") # Depth/MAPQ if no_bam_analysis is not True: all_df = [] pass_df = [] fail_df = [] for chromosome in input_chroms: if window_size is not None and window_size != "None": data = input_bam_obj.analyze_bam( chromosome, ignore_duplicates, exact_depth, int(window_size)) else: data = input_bam_obj.analyze_bam( chromosome, ignore_duplicates, exact_depth, None, target_bed) if whole_genome_threshold is True: tup = utils.make_region_lists_genome_filters( data, mapq_cutoff, min_depth_filter, max_depth_filter) else: tup = utils.make_region_lists_chromosome_filters( data, mapq_cutoff, min_depth_filter, max_depth_filter) all_df.append(data) pass_df.append(tup[0]) fail_df.append(tup[1]) utils.plot_depth_mapq( data, depth_mapq_prefix, sample_id, input_bam_obj.get_chrom_length(chromosome), marker_size, marker_transparency, coordinate_scale) all_concat = pd.concat(all_df) all_concat.to_csv( bam_data_frame, index=False, sep="\t", quoting=csv.QUOTE_NONE) utils.output_bed(output_bed_high, *pass_df) utils.output_bed(output_bed_low, *fail_df) # Platypus if platypus_calling is True: if use_bed_for_platypus is True: a = input_bam_obj.platypus_caller( platypus_path, vcf_log, ref_obj.filepath, input_chroms, cpus, out_vcf, output_bed_high) else: a = input_bam_obj.platypus_caller( platypus_path, vcf_log, ref_obj.filepath, input_chroms, cpus, out_vcf, None) if a != 0: logger.error("Error in platypus calling on {}".format( input_bam_obj.filepath)) logging.shutdown() sys.exit(1) noprocess_vcf_object = variants.VCFFile(out_vcf) if no_variant_plots is not True: if window_size is not None and window_size != "None": noprocess_vcf_object.plot_variants_per_chrom( chrom_list=input_chroms, sampleID=sample_id, output_prefix=readbalance_prefix, site_qual=variant_site_quality, genotype_qual=variant_genotype_quality, depth=variant_depth, MarkerSize=marker_size, MarkerAlpha=marker_transparency, bamfile_obj=input_bam_obj, variant_caller="platypus", homogenize=homogenize_read_balance, dataframe_out=data_frame_readbalance, min_count=min_variant_count, window_size=int(window_size), x_scale=coordinate_scale, target_file=None, include_fixed=fixed) else: noprocess_vcf_object.plot_variants_per_chrom( chrom_list=input_chroms, sampleID=sample_id, output_prefix=readbalance_prefix, site_qual=variant_site_quality, genotype_qual=variant_genotype_quality, depth=variant_depth, MarkerSize=marker_size, MarkerAlpha=marker_transparency, bamfile_obj=input_bam_obj, variant_caller="platypus", homogenize=homogenize_read_balance, dataframe_out=data_frame_readbalance, min_count=min_variant_count, window_size=None, x_scale=coordinate_scale, target_file=target_bed, include_fixed=fixed) return(pass_df, fail_df)
[docs]def ploidy_analysis( passing_df, failing_df, no_perm_test, no_ks_test, no_bootstrap, input_chroms, x_chromosome, y_chromosome, results_dir, num_permutations, num_bootstraps, sample_id): """ Runs the ploidy analysis part of XYalign. * Runs permutation test to systematically compare means between every possible pair of chromosomes * Runs K-S two sample test to systematically compare distributions between every possible pair of chromosomes * Bootstraps the mean depth ratio for every possible pair of chromosomes Parameters ---------- passing_df : list Passing pandas dataframes, one per chromosome failing_df : list Failing pandas dataframes, one per chromosome no_perm_test : bool If False, permutation test will be run no_ks_test : bool If False, KS test will be run no_bootstrap : bool If False, bootstrap analysis will be run input_chroms : list Chromosomes/scaffolds to analyze x_chromosome : list X-linked scaffolds y_chromosome : list Y-likned scaffolds results_dir : str Full path to directory to output results num_permutations : int Number of permutations num_bootstraps : int Number of bootstrap replicates sample_id : str Returns ------- dictionary Results for each test. Keys: perm, ks, boot. """ # Permutations if no_perm_test is False: if y_chromosome != [None]: sex_chromosomes = x_chromosome + y_chromosome perm_res_x = [] perm_res_y = [] else: sex_chromosomes = x_chromosome perm_res_x = [] perm_res_y = None autosomes = [ x for x in input_chroms if x not in sex_chromosomes] for c in autosomes: perm_res_x.append(ploidy.permutation_test_chromosomes( pd.concat(passing_df), c, str(x_chromosome[0]), "chrom", "depth", num_permutations, results_dir + "/{}.{}_{}_permutation_results.txt".format( sample_id, c, str(x_chromosome[0])))) if perm_res_y is not None: perm_res_y.append(ploidy.permutation_test_chromosomes( pd.concat(passing_df), c, str(y_chromosome[0]), "chrom", "depth", num_permutations, results_dir + "/{}.{}_{}_permutation_results.txt".format( sample_id, c, str(y_chromosome[0])))) if perm_res_y is not None: sex_perm_res = ploidy.permutation_test_chromosomes( pd.concat(passing_df), str(x_chromosome[0]), str(y_chromosome[0]), "chrom", "depth", num_permutations, results_dir + "/{}.{}_{}_permutation_results.txt".format( sample_id, str(x_chromosome[0]), str(y_chromosome[0]))) else: sex_perm_res = None # K-S Two Sample if no_ks_test is False: if y_chromosome != [None]: sex_chromosomes = x_chromosome + y_chromosome ks_res_x = [] ks_res_y = [] else: sex_chromosomes = x_chromosome ks_res_x = [] ks_res_y = None autosomes = [ x for x in input_chroms if x not in sex_chromosomes] for c in autosomes: ks_res_x.append(ploidy.ks_two_sample( pd.concat(passing_df), c, str(x_chromosome[0]), "chrom", "depth", results_dir + "/{}.{}_{}_ks_results.txt".format( sample_id, c, str(x_chromosome[0])))) if ks_res_y is not None: ks_res_y.append(ploidy.ks_two_sample( pd.concat(passing_df), c, str(y_chromosome[0]), "chrom", "depth", results_dir + "/{}.{}_{}_ks_results.txt".format( sample_id, c, str(y_chromosome[0])))) if ks_res_y is not None: sex_ks_res = ploidy.ks_two_sample( pd.concat(passing_df), str(x_chromosome[0]), str(y_chromosome[0]), "chrom", "depth", results_dir + "/{}.{}_{}_ks_results.txt".format( sample_id, str(x_chromosome[0]), str(y_chromosome[0]))) else: sex_ks_res = None # Bootstrap if no_bootstrap is False: if y_chromosome != [None]: sex_chromosomes = x_chromosome + y_chromosome boot_res_x = [] boot_res_y = [] else: sex_chromosomes = x_chromosome boot_res_x = [] boot_res_y = None autosomes = [ x for x in input_chroms if x not in sex_chromosomes] for c in autosomes: boot_res_x.append(ploidy.bootstrap( pd.concat(passing_df), c, str(x_chromosome[0]), "chrom", "depth", num_bootstraps, results_dir + "/{}.{}_{}_bootstrap_results.txt".format( sample_id, c, str(x_chromosome[0])))) if boot_res_y is not None: boot_res_y.append(ploidy.bootstrap( pd.concat(passing_df), c, str(y_chromosome[0]), "chrom", "depth", num_bootstraps, results_dir + "/{}.{}_{}_bootstrap_results.txt".format( sample_id, c, str(y_chromosome[0])))) if boot_res_y is not None: sex_boot_res = ploidy.bootstrap( pd.concat(passing_df), str(x_chromosome[0]), str(y_chromosome[0]), "chrom", "depth", num_bootstraps, results_dir + "/{}.{}_{}_bootstrap_results.txt".format( sample_id, str(x_chromosome[0]), str(y_chromosome[0]))) else: sex_boot_res = None return { "perm": [perm_res_x, perm_res_y, sex_perm_res], "ks": [ks_res_x, ks_res_y, sex_ks_res], "boot": [boot_res_x, boot_res_y, sex_boot_res]}
[docs]def remapping( input_bam_obj, y_pres, masked_references, samtools_path, sambamba_path, repairsh_path, shufflesh_path, bwa_path, bwa_flags, single_end, bam_dir, fastq_dir, sample_id, x_chromosome, y_chromosome, cpus, xmx, fastq_compression, cleanup, read_group_id): """ Runs remapping steps of XYalign. * Strips, sorts, and re-pair reads from the sex chromosomes (collecting read group information) * Maps (with sorting) reads (with read group information) to appropriate reference based on presence (or not) of Y chromosome * Merge bam files (if more than one read group) Parameters ---------- input_bam_obj : bam.BamFile() object y_pres : bool True if Y chromosome present in individual masked_references : tuple Masked reference objects (xx, xy) samtools_path : str Path/command to call samtools sambamba_path : str Path/command to call sambamba repairsh_path : str Path/command to call repair.sh shufflesh_path : str Path/command to call shuffle.sh bwa_path : str Path/command to call bwa bwa_flags : str Flags to use for bwa mapping single_end : bool If True, reads treated as single end bam_dir : str Path to output directory for bam files fastq_dir : str Path to output directory for fastq files sample_id : str x_chromosome : list X-linked scaffolds y_chromosome : list Y-linked scaffolds cpus : int Number of threads/cpus xmx : str Value to be combined with -Xmx for java programs (i.e., 4g would result in -Xmx4g) fastq_compression : int Compression level for fastq files. 0 leaves fastq files uncompressed. Otherwise values should be between 1 and 9 (inclusive), with larger values indicating more compression cleanup : bool If True, will delete temporary files read_group_id : str ID to use to add read group information Returns ------- str Path to bam containing remapped sex chromsomes """ if y_pres is True: new_reference = masked_references[1] else: new_reference = masked_references[0] new_fastqs = input_bam_obj.strip_reads( repairsh_path, shufflesh_path, single_end, fastq_dir, sample_id, x_chromosome + y_chromosome, xmx, fastq_compression, cleanup, read_group_id) with open(new_fastqs[0]) as f: read_group_and_fastqs = [line.strip() for line in f] read_group_and_fastqs = [x.split() for x in read_group_and_fastqs] if new_fastqs[1] is not None: with open(new_fastqs[1]) as f: read_group_headers = [line.split() for line in f] temp_bam_list = [] for i in read_group_and_fastqs: if i != [""]: rg_id = i[0] fastq_files = i[1:] for j in read_group_headers: for k in j: if k[0:2] == 'ID': if k[3:] == rg_id: rg_tag = "\t".join(j) break temp_bam = assemble.bwa_mem_mapping_sambamba( bwa_path, samtools_path, sambamba_path, new_reference, "{}/{}.sex_chroms.{}".format( bam_dir, sample_id, rg_id), fastq_files, cpus, rg_tag, [str(x).strip() for x in bwa_flags.split()]) temp_bam_list.append(temp_bam) if len(temp_bam_list) < 2: new_bam = temp_bam_list[0] else: new_bam = bam.samtools_merge( samtools_path, temp_bam_list, "{}/{}.sex_chroms".format( bam_dir, sample_id), cpus) else: fastq_files = read_group_and_fastqs[0][1:] new_bam = assemble.bwa_mem_mapping_sambamba( bwa_path, samtools_path, sambamba_path, new_reference, "{}/{}.sex_chroms.{}".format( bam_dir, sample_id, rg_id), fastq_files, cpus, "None", [str(x).strip() for x in bwa_flags.split()]) return new_bam
[docs]def swap_sex_chroms( input_bam_obj, new_bam_obj, samtools_path, sambamba_path, x_chromosome, y_chromosome, bam_dir, sample_id, cpus, xyalign_params): """ Switches sex chromosmes from new_bam_file with those in original bam file Parameters ---------- input_bam_obj : bam.BamFile() object Original input bam file object new_bam_obj : bam.BamFile() object Bam file object containing newly mapped sex chromosomes (to insert) samtools_path : str Path/command to call samtools sambamba_path : str Path/command to call sambamba x_chromosome : list X-linked scaffolds y_chromosome : str Y-linked scaffolds bam_dir : str Path to bam output directory sample_id : str cpus : int Number of threads/cpus xyalign_params : dict Dictionary of xyalign_params to add to bam header Returns ------- str Path to new bam file containing original autosomes and new sex chromosomes """ merged_bam = bam.switch_sex_chromosomes_sambamba( samtools_path, sambamba_path, input_bam_obj.filepath, new_bam_obj.filepath, x_chromosome + y_chromosome, bam_dir, sample_id, cpus, xyalign_params) return merged_bam
[docs]def main(): # Version - placeholder for now - need to incorporate it into __init__.py citation = """ XYalign: Inferring Sex Chromosome Ploidy in NGS Data Timothy H Webster, Tanya Phung, Madeline Couse, Bruno Grande, Eric Karlins, Phillip Richmond, Whitney Whitford, Melissa A. Wilson Sayres 2017 Version: {} """.format(__version__) # Start timer xyalign_start = time.time() # Grab arguments args = parse_args() # Create directory structure if not already in place utils.validate_dir(args.output_dir, "fastq") utils.validate_dir(args.output_dir, "bam") utils.validate_dir(args.output_dir, "reference") utils.validate_dir(args.output_dir, "bed") utils.validate_dir(args.output_dir, "vcf") utils.validate_dir(args.output_dir, "plots") utils.validate_dir(args.output_dir, "results") utils.validate_dir(args.output_dir, "logfiles") # Set up logfile logfile_path = os.path.join(args.output_dir, "logfiles") if args.logfile is not None: logfile = os.path.join( logfile_path, args.logfile) else: logfile = os.path.join( logfile_path, "{}_xyalign.log".format( args.sample_id)) # Initiate logging logger = logging.getLogger("xyalign") logger.setLevel(logging.DEBUG) # Create File Handler fh = logging.FileHandler(logfile, mode="w") fh.setLevel(logging.DEBUG) # Create Console Handler ch = logging.StreamHandler() if args.reporting_level == "DEBUG": ch.setLevel(logging.DEBUG) elif args.reporting_level == "INFO": ch.setLevel(logging.INFO) elif args.reporting_level == "ERROR": ch.setLevel(logging.ERROR) elif args.reporting_level == "CRITICAL": ch.setLevel(logging.CRITICAL) else: ch.setLevel(logging.INFO) # Set Formatter formatter = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s - %(message)s') fh.setFormatter(formatter) ch.setFormatter(formatter) # Add handlers to logger logger.addHandler(fh) logger.addHandler(ch) # Log citation logger.info("{}".format(citation)) # Set up param dictionary xyalign_params_dict = {'ID': 'XYalign', 'VN': __version__, 'CL': []} p = "" for arg in args.__dict__: p = p + "{}={}, ".format(arg, args.__dict__[arg]) xyalign_params_dict['CL'].append("{}={}".format(arg, args.__dict__[arg])) # Log parameters and pipeline start logger.info("Parameters: {}".format(p)) logger.info("Beginning XYalign") # Check for external programs logger.info("Checking external programs") utils.validate_external_prog(args.repairsh_path, "bbmap's repair.sh") utils.validate_external_prog(args.bedtools_path, "bedtools") utils.validate_external_prog(args.bwa_path, "bwa") utils.validate_external_prog(args.platypus_path, "platypus") utils.validate_external_prog(args.samtools_path, "samtools") utils.validate_external_prog(args.sambamba_path, "sambamba") logger.info("All external program paths successfully checked") # Setup output paths fastq_path = os.path.join(args.output_dir, "fastq") bam_path = os.path.join(args.output_dir, "bam") reference_path = os.path.join(args.output_dir, "reference") bed_path = os.path.join(args.output_dir, "bed") vcf_path = os.path.join(args.output_dir, "vcf") plots_path = os.path.join(args.output_dir, "plots") results_path = os.path.join(args.output_dir, "results") # Create paths for output files # reference-related if args.xx_ref_out_name is not None: xx_out = os.path.join(reference_path, args.xx_ref_out_name) elif args.xx_ref_out is not None: xx_out = args.xx_ref_out else: xx_out = os.path.join(reference_path, "xyalign_noY.masked.fa") if args.xy_ref_out_name is not None: xy_out = os.path.join(reference_path, args.xy_ref_out_name) elif args.xy_ref_out is not None: xy_out = args.xy_ref_out else: xy_out = os.path.join(reference_path, "xyalign_withY.masked.fa") # variant/vcf related noprocessing_vcf = os.path.join( vcf_path, "{}.noprocessing.vcf".format( args.sample_id)) postprocessing_vcf = os.path.join( vcf_path, "{}.postprocessing.vcf".format( args.sample_id)) if args.platypus_logfile is not None: plat_log = args.platypus_logfile else: plat_log = args.sample_id noprocessing_vcf_log = os.path.join( logfile_path, "{}_noprocessing_platypus.log".format( plat_log)) postprocessing_vcf_log = os.path.join( logfile_path, "{}_postprocessing_platypus.log".format( plat_log)) readbalance_prefix_noprocessing = os.path.join( plots_path, "{}_noprocessing".format(args.sample_id)) readbalance_prefix_postprocessing = os.path.join( plots_path, "{}_postprocessing".format(args.sample_id)) # Depth/mapq related chrom_stats_mapq = os.path.join( results_path, "{}_chrom_stats_mapq.txt".format(args.sample_id)) chrom_stats_depth = os.path.join( results_path, "{}_chrom_stats_depth.txt".format(args.sample_id)) chrom_stats_count = os.path.join( results_path, "{}_chrom_stats_count.txt".format(args.sample_id)) depth_mapq_prefix_noprocessing = os.path.join( plots_path, "{}_noprocessing".format(args.sample_id)) depth_mapq_prefix_postprocessing = os.path.join( plots_path, "{}_postprocessing".format(args.sample_id)) data_frame_preprocessing = os.path.join( bed_path, "{}_full_dataframe_depth_mapq_preprocessing.csv".format( args.sample_id)) data_frame_postprocessing = os.path.join( bed_path, "{}_full_dataframe_depth_mapq_postprocessing.csv".format( args.sample_id)) data_frame_readbalance_preprocessing = os.path.join( bed_path, "{}_full_dataframe_readbalance_preprocessing.csv".format( args.sample_id)) data_frame_readbalance_postprocessing = os.path.join( bed_path, "{}_full_dataframe_readbalance_postprocessing.csv".format( args.sample_id)) high_prefix = "{}_highquality_preprocessing".format(args.sample_id) output_bed_high_preprocessing = os.path.join( bed_path, "{}.bed".format(high_prefix)) low_prefix = "{}_lowquality_preprocessing".format(args.sample_id) output_bed_low_preprocessing = os.path.join( bed_path, "{}.bed".format(low_prefix)) high_prefix_postprocessing = "{}_highquality_postprocessing".format( args.sample_id) output_bed_high_postprocessing = os.path.join( bed_path, "{}.bed".format(high_prefix_postprocessing)) low_prefix_postprocessing = "{}_lowquality_postprocessing".format( args.sample_id) output_bed_low_postprocessing = os.path.join( bed_path, "{}.bed".format(low_prefix_postprocessing)) # Set cleanup if args.no_cleanup is True: cleanup = False else: cleanup = True ###################################### # Run XYalign # ###################################### # Reference Prep Only if args.PREPARE_REFERENCE is True: logger.info( "PREPARE_REFERENCE set, so only preparing reference fastas.") ref = reftools.RefFasta(args.ref, args.samtools_path, args.bwa_path) ref_prep( ref_obj=ref, ref_mask=args.reference_mask, ref_dir=reference_path, xx=xx_out, xy=xy_out, y_chromosome=args.y_chromosome, samtools_path=args.samtools_path, bwa_path=args.bwa_path, bwa_index=args.bwa_index) logger.info("PREPARE_REFERENCE complete.") logger.info("XYalign complete. Elapsed time: {} seconds".format( time.time() - xyalign_start)) logging.shutdown() sys.exit(0) if args.CHROM_STATS is True: logger.info( "CHROM_STATS set, will iterate through bam files to calculate " "chromosome-wide averages.") bam_list = [bam.BamFile(x, args.samtools_path) for x in args.bam] chrom_stats_results = chrom_stats( bam_obj_list=bam_list, chrom_list=args.chromosomes, use_counts=args.use_counts) if args.use_counts is False: cs_depth_dict = chrom_stats_results[0] cs_mapq_dict = chrom_stats_results[1] with open(chrom_stats_depth, "w") as f: for i in cs_depth_dict: out_line = "\t".join([str(x) for x in cs_depth_dict[i]]) f.write("{}\n".format(out_line)) with open(chrom_stats_mapq, "w") as f: for i in cs_mapq_dict: out_line = "\t".join([str(x) for x in cs_mapq_dict[i]]) f.write("{}\n".format(out_line)) else: cs_count_dict = chrom_stats_results[0] with open(chrom_stats_count, "w") as f: for i in cs_count_dict: out_line = "\t".join([str(x) for x in cs_count_dict[i]]) f.write("{}\n".format(out_line)) logger.info("CHROM_STATS complete.") logger.info("XYalign complete. Elapsed time: {} seconds".format( time.time() - xyalign_start)) logging.shutdown() sys.exit(0) input_bam = bam.BamFile(args.bam[0], args.samtools_path) if args.chromosomes == ["ALL"] or args.chromosomes == ["all"]: input_chromosomes = list(input_bam.chromosome_names()) else: input_chromosomes = args.chromosomes if any( [args.ANALYZE_BAM, args.CHARACTERIZE_SEX_CHROMS, args.STRIP_READS]) is True: missing_chroms = input_bam.check_chrom_in_bam(input_chromosomes) if len(missing_chroms) != 0: logger.error( "One or more chromosomes provided via --chromosomes not " "present in bam file. Exiting.") logging.shutdown() sys.exit(1) # Strip reads only # This module is isolated first because it does not require a reference fasta if args.STRIP_READS is True: logger.info( "STRIP_READS set, so only stripping reads from {}.".format( input_bam.filepath)) if args.chromosomes == ["ALL"] or args.chromosomes == ["all"]: stripped_fastqs = input_bam.strip_reads( args.repairsh_path, args.shufflesh_path, args.single_end, fastq_path, args.sample_id, [], args.xmx, args.fastq_compression, cleanup, args.read_group_id) else: stripped_fastqs = input_bam.strip_reads( args.repairsh_path, args.shufflesh_path, args.single_end, fastq_path, args.sample_id, input_chromosomes, args.xmx, args.fastq_compression, cleanup, args.read_group_id) logger.info("STRIP_READS complete. Output in {}".format(fastq_path)) logger.info("XYalign complete. Elapsed time: {} seconds".format( time.time() - xyalign_start)) logging.shutdown() sys.exit(0) # Other modules ref = reftools.RefFasta(args.ref, args.samtools_path, args.bwa_path) # Check to ensure bam and fasta are compatible and imports work if args.skip_compatibility_check is False: compatible = utils.check_bam_fasta_compatibility(input_bam, ref) if compatible is False: logger.error( "Exiting due to compatibility issues between {} and {}. Check: " "1) that this fasta was used when generating this bam, 2) that " "sequence lengths and names are identical between the two files. " "You can check 2) by comparing the bam header (e.g., " "samtools -H {}) with a sequence dictionary (.dict) created for " "{}. If you think this is an error, you can use the flag " "--skip_compatibility_check.".format( input_bam.filepath, ref.filepath, input_bam.filepath, ref.filepath)) logging.shutdown() sys.exit(1) # Stats Only if args.ANALYZE_BAM is True: logger.info( "ANALYZE_BAM set, so only running steps required " "for bam analysis.") if args.platypus_calling == "both" or args.platypus_calling == "before": call_variants = True else: call_variants = False bam_analysis( input_bam_obj=input_bam, platypus_calling=call_variants, platypus_path=args.platypus_path, vcf_log=noprocessing_vcf_log, ref_obj=ref, input_chroms=input_chromosomes, cpus=args.cpus, out_vcf=noprocessing_vcf, no_variant_plots=args.no_variant_plots, window_size=args.window_size, target_bed=args.target_bed, sample_id=args.sample_id, readbalance_prefix=readbalance_prefix_noprocessing, variant_site_quality=args.variant_site_quality, variant_genotype_quality=args.variant_genotype_quality, variant_depth=args.variant_depth, marker_size=args.marker_size, marker_transparency=args.marker_transparency, homogenize_read_balance=args.homogenize_read_balance, data_frame_readbalance=data_frame_readbalance_preprocessing, min_variant_count=args.min_variant_count, no_bam_analysis=args.no_bam_analysis, ignore_duplicates=args.ignore_duplicates, exact_depth=args.exact_depth, whole_genome_threshold=args.whole_genome_threshold, mapq_cutoff=args.mapq_cutoff, min_depth_filter=args.min_depth_filter, max_depth_filter=args.max_depth_filter, depth_mapq_prefix=depth_mapq_prefix_noprocessing, bam_data_frame=data_frame_preprocessing, output_bed_high=output_bed_high_preprocessing, output_bed_low=output_bed_low_preprocessing, use_bed_for_platypus=False, coordinate_scale=args.coordinate_scale, fixed=args.include_fixed) logger.info("ANALYZE_BAM complete.") logger.info("XYalign complete. Elapsed time: {} seconds".format( time.time() - xyalign_start)) logging.shutdown() sys.exit(0) # Ploidy Estimation Only elif args.CHARACTERIZE_SEX_CHROMS is True: logger.info( "CHARACTERIZE_SEX_CHROMS set, so only running steps required " "for to characterize sex chromosome complement. Note that " "this involve both playtpus calling and bam analysis too.") if args.platypus_calling == "both" or args.platypus_calling == "before": call_variants = True else: call_variants = False bam_dfs = bam_analysis( input_bam_obj=input_bam, platypus_calling=call_variants, platypus_path=args.platypus_path, vcf_log=noprocessing_vcf_log, ref_obj=ref, input_chroms=input_chromosomes, cpus=args.cpus, out_vcf=noprocessing_vcf, no_variant_plots=args.no_variant_plots, window_size=args.window_size, target_bed=args.target_bed, sample_id=args.sample_id, readbalance_prefix=readbalance_prefix_noprocessing, variant_site_quality=args.variant_site_quality, variant_genotype_quality=args.variant_genotype_quality, variant_depth=args.variant_depth, marker_size=args.marker_size, marker_transparency=args.marker_transparency, homogenize_read_balance=args.homogenize_read_balance, data_frame_readbalance=data_frame_readbalance_preprocessing, min_variant_count=args.min_variant_count, no_bam_analysis=args.no_bam_analysis, ignore_duplicates=args.ignore_duplicates, exact_depth=args.exact_depth, whole_genome_threshold=args.whole_genome_threshold, mapq_cutoff=args.mapq_cutoff, min_depth_filter=args.min_depth_filter, max_depth_filter=args.max_depth_filter, depth_mapq_prefix=depth_mapq_prefix_noprocessing, bam_data_frame=data_frame_preprocessing, output_bed_high=output_bed_high_preprocessing, output_bed_low=output_bed_low_preprocessing, use_bed_for_platypus=False, coordinate_scale=args.coordinate_scale, fixed=args.include_fixed) ploidy_results = ploidy_analysis( passing_df=bam_dfs[0], failing_df=bam_dfs[1], no_perm_test=args.no_perm_test, no_ks_test=args.no_ks_test, no_bootstrap=args.no_bootstrap, input_chroms=input_chromosomes, x_chromosome=args.x_chromosome, y_chromosome=args.y_chromosome, results_dir=results_path, num_permutations=args.num_permutations, num_bootstraps=args.num_bootstraps, sample_id=args.sample_id) logger.info("CHARACTERIZE_SEX_CHROMS complete.") logger.info("XYalign complete. Elapsed time: {} seconds".format( time.time() - xyalign_start)) logging.shutdown() sys.exit(0) # Remapping Only elif args.REMAPPING is True: logger.info( "REMAPPING set, so only running steps required for remapping. " "--y_present or --y_absent must be set") if args.y_present is True: y_present = True logger.info("--y_present provided, so remapping for XY individual") elif args.y_absent is True: logger.info("--y_absent provided, so remapping for XX individual") y_present = False else: logger.error( "--y_present or --y_absent required for --REMAPPING. " "Exiting.") logging.shutdown() sys.exit(1) if args.xx_ref_in is None or args.xy_ref_in is None: logger.info( "Input masked reference not provided for both " "XX and XY mapping, so creating both") masked_refs = ref_prep( ref_obj=ref, ref_mask=args.reference_mask, ref_dir=reference_path, xx=xx_out, xy=xy_out, y_chromosome=args.y_chromosome, samtools_path=args.samtools_path, bwa_path=args.bwa_path, bwa_index=True) else: xx = reftools.RefFasta( args.xx_ref_in, args.samtools_path, args.bwa_path) xy = reftools.RefFasta( args.xy_ref_in, args.samtools_path, args.bwa_path) xx.conditional_index_bwa() xx.conditional_seq_dict() xy.conditional_index_bwa() xy.conditional_seq_dict() masked_refs = (xx, xy) sex_chrom_bam = bam.BamFile( remapping( input_bam_obj=input_bam, y_pres=y_present, masked_references=masked_refs, samtools_path=args.samtools_path, sambamba_path=args.sambamba_path, repairsh_path=args.repairsh_path, shufflesh_path=args.shufflesh_path, bwa_path=args.bwa_path, bwa_flags=args.bwa_flags, single_end=args.single_end, bam_dir=bam_path, fastq_dir=fastq_path, sample_id=args.sample_id, x_chromosome=args.x_chromosome, y_chromosome=args.y_chromosome, cpus=args.cpus, xmx=args.xmx, fastq_compression=args.fastq_compression, cleanup=cleanup, read_group_id=args.read_group_id), args.samtools_path) if args.sex_chrom_bam_only is True: final_bam = sex_chrom_bam else: final_bam = bam.BamFile( swap_sex_chroms( input_bam_obj=input_bam, new_bam_obj=sex_chrom_bam, samtools_path=args.samtools_path, sambamba_path=args.sambamba_path, x_chromosome=args.x_chromosome, y_chromosome=args.y_chromosome, bam_dir=bam_path, sample_id=args.sample_id, cpus=args.cpus, xyalign_params=xyalign_params_dict), args.samtools_path) logger.info("REMAPPING complete.") logger.info("XYalign complete. Elapsed time: {} seconds".format( time.time() - xyalign_start)) logging.shutdown() sys.exit(0) # Full Pipeline else: logger.info( "Running entire XYalign pipeline") missing_chroms = input_bam.check_chrom_in_bam(input_chromosomes) if len(missing_chroms) != 0: logger.error( "One or more chromosomes provided via --chromosomes not " "present in bam file. Exiting.") logging.shutdown() sys.exit(1) if args.xx_ref_in is None or args.xy_ref_in is None: logger.info( "Input masked reference not provided for both " "XX and XY mapping, so creating both") masked_refs = ref_prep( ref_obj=ref, ref_mask=args.reference_mask, ref_dir=reference_path, xx=xx_out, xy=xy_out, y_chromosome=args.y_chromosome, samtools_path=args.samtools_path, bwa_path=args.bwa_path, bwa_index=True) else: xx = reftools.RefFasta( args.xx_ref_in, args.samtools_path, args.bwa_path) xy = reftools.RefFasta( args.xy_ref_in, args.samtools_path, args.bwa_path) xx.conditional_index_bwa() xx.conditional_seq_dict() xy.conditional_index_bwa() xy.conditional_seq_dict() masked_refs = (xx, xy) if args.platypus_calling == "both" or args.platypus_calling == "before": call_variants = True else: call_variants = False bam_dfs = bam_analysis( input_bam_obj=input_bam, platypus_calling=call_variants, platypus_path=args.platypus_path, vcf_log=noprocessing_vcf_log, ref_obj=ref, input_chroms=input_chromosomes, cpus=args.cpus, out_vcf=noprocessing_vcf, no_variant_plots=args.no_variant_plots, window_size=args.window_size, target_bed=args.target_bed, sample_id=args.sample_id, readbalance_prefix=readbalance_prefix_noprocessing, variant_site_quality=args.variant_site_quality, variant_genotype_quality=args.variant_genotype_quality, variant_depth=args.variant_depth, marker_size=args.marker_size, marker_transparency=args.marker_transparency, homogenize_read_balance=args.homogenize_read_balance, data_frame_readbalance=data_frame_readbalance_preprocessing, min_variant_count=args.min_variant_count, no_bam_analysis=args.no_bam_analysis, ignore_duplicates=args.ignore_duplicates, exact_depth=args.exact_depth, whole_genome_threshold=args.whole_genome_threshold, mapq_cutoff=args.mapq_cutoff, min_depth_filter=args.min_depth_filter, max_depth_filter=args.max_depth_filter, depth_mapq_prefix=depth_mapq_prefix_noprocessing, bam_data_frame=data_frame_preprocessing, output_bed_high=output_bed_high_preprocessing, output_bed_low=output_bed_low_preprocessing, use_bed_for_platypus=False, coordinate_scale=args.coordinate_scale, fixed=args.include_fixed) ploidy_results = ploidy_analysis( passing_df=bam_dfs[0], failing_df=bam_dfs[1], no_perm_test=args.no_perm_test, no_ks_test=args.no_ks_test, no_bootstrap=args.no_bootstrap, input_chroms=input_chromosomes, x_chromosome=args.x_chromosome, y_chromosome=args.y_chromosome, results_dir=results_path, num_permutations=args.num_permutations, num_bootstraps=args.num_bootstraps, sample_id=args.sample_id) if args.y_present is True: y_present = True logger.info("--y_present provided, so remapping for XY individual") elif args.y_absent is True: y_present = False logger.info("--y_absent provided, so remapping for XX individual") else: if ploidy_results["boot"][2][2] > args.sex_chrom_calling_threshold: y_present = False logger.info( "X/Y depth ratio ({}) > {}. Y inferred to be absent.".format( ploidy_results["boot"][2][2], args.sex_chrom_calling_threshold)) else: y_present = True logger.info( "X/Y depth ratio ({}) <= {}. Y inferred to be present.".format( ploidy_results["boot"][2][2], args.sex_chrom_calling_threshold)) sex_chrom_bam = bam.BamFile( remapping( input_bam_obj=input_bam, y_pres=y_present, masked_references=masked_refs, samtools_path=args.samtools_path, sambamba_path=args.sambamba_path, repairsh_path=args.repairsh_path, shufflesh_path=args.shufflesh_path, bwa_path=args.bwa_path, bwa_flags=args.bwa_flags, single_end=args.single_end, bam_dir=bam_path, fastq_dir=fastq_path, sample_id=args.sample_id, x_chromosome=args.x_chromosome, y_chromosome=args.y_chromosome, cpus=args.cpus, xmx=args.xmx, fastq_compression=args.fastq_compression, cleanup=cleanup, read_group_id=args.read_group_id), args.samtools_path) if args.sex_chrom_bam_only is True: final_bam = sex_chrom_bam else: final_bam = bam.BamFile( swap_sex_chroms( input_bam_obj=input_bam, new_bam_obj=sex_chrom_bam, samtools_path=args.samtools_path, sambamba_path=args.sambamba_path, x_chromosome=args.x_chromosome, y_chromosome=args.y_chromosome, bam_dir=bam_path, sample_id=args.sample_id, cpus=args.cpus, xyalign_params=xyalign_params_dict), args.samtools_path) if args.platypus_calling == "both" or args.platypus_calling == "after": call_variants = True else: call_variants = False bam_analysis( input_bam_obj=final_bam, platypus_calling=call_variants, platypus_path=args.platypus_path, vcf_log=postprocessing_vcf_log, ref_obj=ref, input_chroms=input_chromosomes, cpus=args.cpus, out_vcf=postprocessing_vcf, no_variant_plots=args.no_variant_plots, window_size=args.window_size, target_bed=args.target_bed, sample_id=args.sample_id, readbalance_prefix=readbalance_prefix_postprocessing, variant_site_quality=args.variant_site_quality, variant_genotype_quality=args.variant_genotype_quality, variant_depth=args.variant_depth, marker_size=args.marker_size, marker_transparency=args.marker_transparency, homogenize_read_balance=args.homogenize_read_balance, data_frame_readbalance=data_frame_readbalance_postprocessing, min_variant_count=args.min_variant_count, no_bam_analysis=args.no_bam_analysis, ignore_duplicates=args.ignore_duplicates, exact_depth=args.exact_depth, whole_genome_threshold=args.whole_genome_threshold, mapq_cutoff=args.mapq_cutoff, min_depth_filter=args.min_depth_filter, max_depth_filter=args.max_depth_filter, depth_mapq_prefix=depth_mapq_prefix_postprocessing, bam_data_frame=data_frame_postprocessing, output_bed_high=output_bed_high_postprocessing, output_bed_low=output_bed_low_postprocessing, use_bed_for_platypus=True, coordinate_scale=args.coordinate_scale, fixed=args.include_fixed) logger.info("XYalign complete. Elapsed time: {} seconds".format( time.time() - xyalign_start)) logging.shutdown() sys.exit(0)
if __name__ == "__main__": main()