# XYalign main program
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import collections
import csv
import logging
import os
import subprocess
import sys
import time
import pandas as pd
import xyalign.assemble as assemble
import xyalign.bam as bam
import xyalign.ploidy as ploidy
import xyalign.reftools as reftools
import xyalign.utils as utils
import xyalign.variants as variants
# from xyalign import assemble, bam, ploidy, reftools, utils, variants
# Grab XYalign version from _version.py in the xyalign directory
dir = os.path.dirname(__file__)
version_py = os.path.join(dir, "_version.py")
exec(open(version_py).read())
[docs]def parse_args():
"""
Parse command-line arguments
Returns
-------
Parser argument namespace
"""
parser = argparse.ArgumentParser(description="XYalign")
parser.add_argument(
"--bam", nargs="*", help="Full path to input bam files. If more "
"than one provided, only the first will be used for modules other "
"than --CHROM_STATS")
parser.add_argument(
"--cram", nargs="*", help="Full path to input cram files. If more "
"than one provided, only the first will be used for modules other "
"than --CHROM_STATS. Not currently supported.")
parser.add_argument(
"--sam", nargs="*", help="Full path to input sam files. If more "
"than one provided, only the first will be used for modules other "
"than --CHROM_STATS. Not currently supported.")
parser.add_argument(
"--ref", required=True,
help="REQUIRED. Path to reference sequence (including file name).")
parser.add_argument(
"--output_dir", "-o", required=True,
help="REQUIRED. Output directory. XYalign will create a directory "
"structure within this directory")
parser.add_argument(
"--chromosomes", "-c", nargs="*", default=[None],
help="Chromosomes to analyze (names must match reference exactly). "
"For humans, we recommend at least chr19, chrX, chrY. Generally, we "
"suggest including the sex chromosomes and at least one autosome. "
"To analyze all chromosomes use '--chromosomes ALL' or "
"'--chromosomes all'.")
parser.add_argument(
"--x_chromosome", "-x", nargs="*", default=[None],
help="Names of x-linked scaffolds in reference fasta (must match "
"reference exactly). ")
parser.add_argument(
"--y_chromosome", "-y", nargs="*", default=[None],
help="Names of y-linked scaffolds in reference fasta (must match "
"reference exactly). Defaults to chrY. Give None if using an assembly "
"without a Y chromosome")
parser.add_argument(
"--sample_id", "-id", default="sample",
help="Name/ID of sample - for use in plot titles and file naming. "
"Default is sample")
parser.add_argument(
"--cpus", type=int, default=1,
help="Number of cores/threads to use. Default is 1")
parser.add_argument(
"--xmx", default="None",
help="Memory to be provided to java programs via -Xmx. E.g., use the "
"flag '--xmx 4g' to pass '-Xmx4g' as a flag when running java "
"programs (currently just repair.sh). Default is 'None' (i.e., nothing "
"provided on the command line), which will allow repair.sh to "
"automatically allocate memory. Note that if you're using --STRIP_READS "
"on deep coverage whole genome data, you might need quite a bit of memory, "
"e.g. '--xmx 16g', '--xmx 32g', or more depending on how many reads "
"are present per read group.")
parser.add_argument(
"--fastq_compression", default=3, type=int, choices=range(0, 10),
help="Compression level for fastqs output from repair.sh. Between "
"(inclusive) 0 and 9. Default is 3. 1 through 9 indicate "
"compression levels. If 0, fastqs will be uncompressed.")
parser.add_argument(
"--single_end", action="store_true", default=False,
help="Include flag if reads are single-end and NOT paired-end.")
parser.add_argument(
"--version", "-V", action="version",
version="XYalign {}".format(__version__),
help="Print version and exit.")
parser.add_argument(
"--no_cleanup", action="store_true", default=False,
help="Include flag to preserve temporary files.")
# Options to run specific parts of the pipeline
pipeline_group = parser.add_mutually_exclusive_group(required=False)
pipeline_group.add_argument(
"--PREPARE_REFERENCE", action="store_true", default=False,
help="This flag will limit XYalign to only preparing reference fastas "
"for individuals with and without Y chromosomes. These fastas can "
"then be passed with each sample to save subsequent processing time.")
pipeline_group.add_argument(
"--CHROM_STATS", action="store_true", default=False,
help="This flag will limit XYalign to only analyzing provided bam files "
"for depth and mapq across entire chromosomes.")
pipeline_group.add_argument(
"--ANALYZE_BAM", action="store_true", default=False,
help="This flag will limit XYalign to only analyzing the bam file for "
"depth, mapq, and (optionally) read balance and outputting plots.")
pipeline_group.add_argument(
"--CHARACTERIZE_SEX_CHROMS", action="store_true", default=False,
help="This flag will limit XYalign to the steps required to "
"characterize sex chromosome content (i.e., analyzing the bam "
"for depth, mapq, and read balance and running statistical tests "
"to help infer ploidy)")
pipeline_group.add_argument(
"--REMAPPING", action="store_true", default=False,
help="This flag will limit XYalign to only the steps required to "
"strip reads and remap to masked references. If masked references "
"are not provided, they will be created.")
pipeline_group.add_argument(
"--STRIP_READS", action="store_true", default=False,
help="This flag will limit XYalign to only the steps required to "
"strip reads from a provided bam file.")
# Logging options
parser.add_argument(
"--logfile", default=None,
help="Name of logfile. Will overwrite if exists. Default is "
"sample_xyalign.log")
parser.add_argument(
"--reporting_level", default='INFO',
choices=["DEBUG", "INFO", "ERROR", "CRITICAL"],
help="Set level of messages printed to console. Default is 'INFO'. "
"Choose from (in decreasing amount of reporting) DEBUG, INFO, ERROR "
"or CRITICAL")
# Program paths
parser.add_argument(
"--platypus_path", default="platypus",
help="Path to platypus. Default is 'platypus'. If platypus is not "
"directly callable (e.g., '/path/to/platypus' or "
"'/path/to/Playpus.py'), then provide path to python as well (e.g., "
"'/path/to/python /path/to/platypus'). In addition, be sure provided "
"python is version 2. See the documentation for more information "
"about setting up an anaconda environment.")
parser.add_argument(
"--bwa_path", default="bwa",
help="Path to bwa. Default is 'bwa'")
parser.add_argument(
"--samtools_path", default="samtools",
help="Path to samtools. Default is 'samtools'")
parser.add_argument(
"--repairsh_path", default="repair.sh",
help="Path to bbmap's repair.sh script. Default is 'repair.sh'")
parser.add_argument(
"--shufflesh_path", default="repair.sh",
help="Path to bbmap's shuffle.sh script. Default is 'shuffle.sh'")
parser.add_argument(
"--sambamba_path", default="sambamba",
help="Path to sambamba. Default is 'sambamba'")
parser.add_argument(
"--bedtools_path", default="bedtools",
help="Path to bedtools. Default is 'bedtools'")
# Options for turning on/off parts of the pipeline
parser.add_argument(
"--platypus_calling", default="both",
choices=["both", "none", "before", "after"],
help="Platypus calling withing the pipeline "
"(before processing, after processing, both, "
"or neither). Options: both, none, before, after.")
parser.add_argument(
"--no_variant_plots", action="store_true", default=False,
help="Include flag to prevent plotting read balance from VCF files.")
parser.add_argument(
"--no_bam_analysis", action="store_true", default=False,
help="Include flag to prevent depth/mapq analysis of bam file. "
"Used to isolate platypus_calling.")
parser.add_argument(
"--skip_compatibility_check", action="store_true", default=False,
help="Include flag to prevent check of compatibility between input bam "
"and reference fasta")
parser.add_argument(
"--no_perm_test", action="store_true", default=False,
help="Include flag to turn off permutation tests.")
parser.add_argument(
"--no_ks_test", action="store_true", default=False,
help="Include flag to turn off KS Two Sample tests.")
parser.add_argument(
"--no_bootstrap", action="store_true", default=False,
help="Include flag to turn off bootstrap analyses. Requires either "
"--y_present, --y_absent, or --sex_chrom_calling_threshold "
"if running full pipeline.")
# Variant Flags
parser.add_argument(
"--variant_site_quality", "-vsq", type=int, default=30,
help="Consider all SNPs with a site quality (QUAL) greater than or "
"equal to this value. Default is 30.")
parser.add_argument(
"--variant_genotype_quality", "-vgq", type=int, default=30,
help="Consider all SNPs with a sample genotype quality greater than or "
"equal to this value. Default is 30.")
parser.add_argument(
"--variant_depth", "-vd", type=int, default=4,
help="Consider all SNPs with a sample depth greater than or "
"equal to this value. Default is 4.")
parser.add_argument(
"--platypus_logfile", default=None,
help="Prefix to use for Platypus log files. Will default to the "
"sample_id argument provided")
parser.add_argument(
"--homogenize_read_balance", type=bool, default=False,
help="If True, read balance values will be transformed by subtracting "
"each value from 1. For example, 0.25 and 0.75 would be treated "
"equivalently. Default is False.")
parser.add_argument(
"--min_variant_count", type=int, default=1,
help="Minimum number of variants in a window for the read balance of "
"that window to be plotted. Note that this does not affect plotting "
"of variant counts. Default is 1, though we note that many window "
"averages will be meaningless at this setting.")
# Reference-related Flags
parser.add_argument(
"--reference_mask", nargs="*",
help="Bed file containing regions to replace with Ns in the sex "
"chromosome reference. Examples might include the pseudoautosomal "
"regions on the Y to force all mapping/calling on those regions of the "
"X chromosome. Default is None.")
parser.add_argument(
"--xx_ref_out_name", default=None,
help="Desired name for masked output fasta for "
"samples WITHOUT a Y chromosome (e.g., XX, XXX, XO, etc.). "
"Defaults to 'xyalign_noY.masked.fa'. Will be output "
"in the XYalign reference directory.")
parser.add_argument(
"--xy_ref_out_name", default=None,
help="Desired name for masked output fasta for "
"samples WITH a Y chromosome (e.g., XY, XXY, etc.). "
"Defaults to 'xyalign_withY.masked.fa'. Will be output "
"in the XYalign reference directory.")
parser.add_argument(
"--xx_ref_out", default=None,
help="Desired path to and name of masked output fasta for "
"samples WITHOUT a Y chromosome (e.g., XX, XXX, XO, etc.). "
"Overwrites if exists. "
"Use if you would like output somewhere other than XYalign reference "
"directory. Otherwise, use --xx_ref_name.")
parser.add_argument(
"--xy_ref_out", default=None,
help="Desired path to and name of masked output fasta for "
"samples WITH a Y chromosome (e.g., XY, XXY, etc.). Overwrites if exists. "
"Use if you would like output somewhere other than XYalign reference "
"directory. Otherwise, use --xy_ref_name.")
parser.add_argument(
"--xx_ref_in", default=None,
help="Path to preprocessed reference fasta to be used for remapping in "
"X0 or XX samples. Default is None. If none, will produce a "
"sample-specific reference for remapping.")
parser.add_argument(
"--xy_ref_in", default=None,
help="Path to preprocessed reference fasta to be used for remapping in "
"samples containing Y chromosome. Default is None. If none, will "
"produce a sample-specific reference for remapping.")
parser.add_argument(
"--bwa_index", type=bool, default=False,
help="If True, index with BWA during PREPARE_REFERENCE. Only relevant"
"when running the PREPARE_REFERENCE module by itself. Default is False.")
# Mapping/remapping arguments
parser.add_argument(
"--read_group_id", default="xyalign", type=str,
help="If read groups are present in a bam file, they are used by default in "
"remapping steps. However, if read groups are not present in a file, "
"there are two options for proceeding. If '--read_group_id None' is "
"provided (case sensitive), then no read groups will be used in "
"subsequent mapping steps. Otherwise, any other string provided to "
"this flag will be used as a read group ID. "
"Default is '--read_group_id xyalign'")
parser.add_argument(
"--bwa_flags", type=str, default="",
help="Provide a string (in quotes, with spaces between arguments) "
"for additional flags desired for BWA mapping (other than -R and -t). "
"Example: '-M -T 20 -v 4'. Note that those are spaces between "
"arguments.")
parser.add_argument(
"--sex_chrom_bam_only", action="store_true", default=False,
help="This flag skips merging the new sex chromosome bam file back "
"into the original bam file (i.e., sex chrom swapping). This will "
"output a bam file containing only the newly remapped sex chromosomes.")
parser.add_argument(
"--sex_chrom_calling_threshold", type=float, default=2.0,
help="This is the *maximum* filtered X/Y depth ratio for an individual "
"to be considered as having heterogametic sex chromsomes (e.g., XY) for "
"the REMAPPING module of XYalign. "
"Note here that X and Y chromosomes are simply the chromosomes that "
"have been designated as X and Y via --x_chromosome and --y_chromosome. "
"Keep in mind that the ideal threshold will vary according to sex "
"determination mechanism, sequence homology between the sex chromosomes, "
"reference genome, sequencing methods, etc. See documentation for more "
"detail. Default is 2.0, which we found to be reasonable for exome, "
"low-coverage whole-genome, and high-coverage whole-genome human data.")
group_y = parser.add_mutually_exclusive_group(required=False)
group_y.add_argument(
"--y_present", action="store_true", default=False,
help="Overrides sex chr estimation by XYalign and remaps with Y present.")
group_y.add_argument(
"--y_absent", action="store_true", default=False,
help="Overrides sex chr estimation by XY align and remaps with Y absent.")
# Bam Analysis and Characterize Sex Chrom Flags
parser.add_argument(
"--window_size", "-w", default=None,
help="Window size (integer) for sliding window calculations. Default "
"is 50000. Default is None. If set to None, will use targets "
"provided using --target_bed.")
parser.add_argument(
"--target_bed", default=None,
help="Bed file containing targets to use in sliding window analyses "
"instead of a fixed window width. Either this or --window_size needs "
"to be set. Default is None, which will use window size provided "
"with --window_size. If not None, and --window_size is None, analyses "
"will use targets in provided file. Must be typical bed format, "
"0-based indexing, with the first three columns containing "
"the chromosome name, start coordinate, stop coordinate.")
parser.add_argument(
"--exact_depth", action="store_true", default=False,
help="Calculate exact depth within windows, else use much faster "
"approximation. *Currently exact is not implemented*. Default is False.")
parser.add_argument(
"--whole_genome_threshold", action="store_true", default=False,
help="This flag will calculate the depth filter threshold based on "
"all values from across the genome. By default, thresholds are "
"calculated per chromosome.")
parser.add_argument(
"--mapq_cutoff", "-mq", type=int, default=20,
help="Minimum mean mapq threshold for a window to be "
"considered high quality. Default is 20.")
parser.add_argument(
"--min_depth_filter", type=float, default=0.0,
help="Minimum depth threshold for a window to be considered high "
"quality. Calculated as mean depth * min_depth_filter. So, a "
"min_depth_filter of 0.2 would require at least a minimum depth "
"of 2 if the mean depth was 10. Default is 0.0 to consider all windows.")
parser.add_argument(
"--max_depth_filter", type=float, default=10000.0,
help="Maximum depth threshold for a window to be considered high "
"quality. Calculated as mean depth * max_depth_filter. So, a "
"max_depth_filter of 4 would require depths to be less than or "
"equal to 40 if the mean depth was 10. "
"Default is 10000.0 to consider all windows.")
parser.add_argument(
"--num_permutations", type=int, default=10000,
help="Number of permutations to use for permutation analyses. "
"Default is 10000")
parser.add_argument(
"--num_bootstraps", type=int, default=10000,
help="Number of bootstrap replicates to use when bootstrapping mean "
"depth ratios among chromosomes. Default is 10000")
parser.add_argument(
"--ignore_duplicates", action="store_true", default=False,
help="Ignore duplicate reads in bam analyses. Default is to include "
"duplicates.")
# Plotting flags
parser.add_argument(
"--marker_size", type=float, default=10.0,
help="Marker size for genome-wide plots in matplotlib. Default is 10.")
parser.add_argument(
"--marker_transparency", "-mt", type=float, default=0.5,
help="Transparency of markers in genome-wide plots. "
"Alpha in matplotlib. Default is 0.5")
parser.add_argument(
"--coordinate_scale", type=int, default=1000000,
help="For genome-wide scatter plots, divide all coordinates by this value."
"Default is 1000000, which will plot everything in megabases.")
parser.add_argument(
"--include_fixed", type=bool, default=False,
help="Default is False, which removes read balances less than or equal to "
"0.05 and equal to 1.0 for histogram plotting. True will include "
"all values. Extreme values removed by default because they often swamp "
"out the signal of the rest of the distribution.")
# CHROM_STATS flags
parser.add_argument(
"--use_counts", action="store_true", default=False,
help="If True, get counts of reads per chromosome for CHROM_STATS, rather "
"than calculating mean depth and mapq. Much faster, but provides less "
"information. Default is False")
args = parser.parse_args()
# Validate Arguments
mods = [
args.PREPARE_REFERENCE, args.ANALYZE_BAM, args.CHARACTERIZE_SEX_CHROMS,
args.REMAPPING, args.STRIP_READS, args.CHROM_STATS]
if not any(mods) is True:
full_pipeline = True
else:
full_pipeline = False
# Ensure only one of bam, sam, or cram is set
bam_flags = [args.bam, args.cram, args.sam]
bam_flags = [x for x in bam_flags if x is not None]
if len(bam_flags) > 1:
sys.exit(
"Error. --bam, --cram, and --sam are mutally exclusive, please "
"provide no more than one. Submitted values are: --bam {} "
"--cram {} --sam {}".format(args.bam, args.cram, args.sam))
elif len(bam_flags) == 0:
if args.PREPARE_REFERENCE is not True:
sys.exit(
"Error. All modules other than PREPARE_REFERENCE require an input "
"bam file.")
# Cram and SAM files are currently unsupported
if args.cram is not None:
sys.exit(
"Error. XYalign does not currently support cram files. "
"Please provide a bam file via --bam instead.")
if args.sam is not None:
sys.exit(
"Error. XYalign does not currently support sam files. "
"Please provide a bam file via --bam instead.")
# Exact depth is not currently implemented
if args.exact_depth is True:
sys.exit(
"Error. Exact depth is not currently implemented. Exiting.")
# Validate ploidy test arguments
test_flags = [args.no_perm_test, args.no_bootstrap, args.no_ks_test]
if full_pipeline is True:
if args.no_bootstrap is True:
if args.y_present is False and args.y_absent is False:
sys.exit(
"Error. Either --y_present or --y_absent needs to be "
"included with if running the full pipeline "
"with --no_bootstrap. XYalign uses the results of the "
"bootstrap for sex chromosome complement inference.")
if args.REMAPPING is True:
if any([args.y_present, args.y_absent]) is False:
sys.exit(
"Error. Either --y_present or --y_absent needs to be "
"included with if runnning --REMAPPING. If you are "
"interested in both sex chromosome complement inference "
"and remapping based on the results of said inference, "
"consider running the full pipeline.")
# Validate chromosome arguments
if any(
[full_pipeline, args.ANALYZE_BAM, args.CHARACTERIZE_SEX_CHROMS,
args.STRIP_READS, args.CHROM_STATS]) is True:
if args.chromosomes == [None]:
sys.exit("Please provide chromosome names to analyze (--chromosomes)")
if any(
[full_pipeline, args.CHARACTERIZE_SEX_CHROMS]) is True:
if len(args.chromosomes) < 2:
sys.exit(
"You only provided a single chromosome to analyze with "
"--chromosomes. CHARACTERIZE_SEX_CHROMS requires at least two "
"chromosomes, including one designated as an X chromosome.")
if any(
[full_pipeline, args.CHARACTERIZE_SEX_CHROMS, args.REMAPPING]) is True:
if args.x_chromosome == [None]:
sys.exit(
"Please provide an 'X chromosome' to analyze (--x_chromosome). "
"Note that this does not have to actually be an x-linked sequence, "
"but this designation is required for the tests in "
"CHARACTERIZE_SEX_CHROMS, which involve pairwise comparisons "
"between the 'X chromosome', the 'Y chromosome', and all other "
"'autosomes' (see documentation for more details).")
if any(
[full_pipeline, args.PREPARE_REFERENCE, args.REMAPPING]) is True:
if args.y_chromosome == [None]:
sys.exit(
"Please provide an 'Y chromosome' for the full pipeline, "
"--PREPARE_REFERENCE, or --REMAPPING. It is required for the "
"creation and use of an alternate reference (e.g., "
"masking the Y chromosome for the XX reference)")
# Validate bwa arguments
bwa_args = [str(x).strip() for x in args.bwa_flags.split()]
red_list = ["-rm", "rm", "-rf", "rf", "-RM", "RM", "-RF", "RF"]
if any(x in bwa_args for x in red_list):
sys.exit(
"Found either rm or rf in your bwa flags. Exiting to prevent "
"unintended shell consequences")
yellow_list = ["-R", "-t"]
if any(x in bwa_args for x in yellow_list):
sys.exit(
"Found either -R or -t in bwa flags. These flags are already used "
"in XYalign. Please remove.")
# Validate sliding window options
if any(
[full_pipeline, args.ANALYZE_BAM, args.CHARACTERIZE_SEX_CHROMS]) is True:
if args.window_size is not None and args.window_size != "None":
if args.window_size.isdigit() is False:
sys.exit(
"--window_size needs to be either None or a positive integer. "
"Exiting.")
else:
if args.target_bed is None:
sys.exit(
"If --window_size is None, --target_bed needs to be used. "
"Please set one of these two flags if running ANALYZE_BAM, "
"CHARACTERIZE_SEX_CHROMS, or the full pipeline. Exiting.")
elif os.path.exists(args.target_bed) is False:
sys.exit(
"Invalid file provided with --target_bed. Check path. Exiting.")
# Return arguments namespace
return args
[docs]def ref_prep(
ref_obj, ref_mask, ref_dir, xx, xy, y_chromosome,
samtools_path, bwa_path, bwa_index):
"""
Reference prep part of XYalign pipeline.
* Creates two reference fasta files. Both will include masks provied with
ref_mask. One will additionally have the entire Y chromosome
hard masked.
* Indexes (.fai, .dict, and optionally bwa indices) both new references
Parameters
----------
ref_obj : RefFasta() object
A reftools.RefFasta() object of a fasta reference file to be processed
ref_mask : list or None
List of files to use to *hard-mask* references. None will ignore masking.
ref_dir : str
Path to output directory
xx : str
Path to XX output reference
xy : str
Path to XY output reference
y_chromosome : str
Name of Y chromosome in fasta
samtools_path : str
The path to samtools (i.e, "samtools" if in path)
bwa_path : str
The path to bwa (i.e, "bwa" if in path)
bwa_index : bool
If True, create bwa indices. Don't if False.
Returns
-------
tuple
Paths to two masked references (y_masked, y_unmasked)
"""
logger = logging.getLogger("xyalign")
# Combine masks, if more than one present
if ref_mask is not None:
if len(ref_mask) > 1:
reference_mask = utils.merge_bed_files(
"{}/reference_mask.merged.bed".format(
ref_dir), *ref_mask)
else:
reference_mask = ref_mask[0]
else:
reference_mask = None
# Create masked noY reference
y_mask = ref_obj.chromosome_bed("{}/Y.bed".format(
ref_dir), y_chromosome)
if reference_mask is not None:
noy_out = ref_obj.mask_reference(
utils.merge_bed_files(
"{}/reference_mask.maskY.merged.bed".format(
ref_dir), reference_mask, y_mask), xx)
else:
noy_out = ref_obj.mask_reference(y_mask, xx)
noy_ref = reftools.RefFasta(noy_out, samtools_path, bwa_path)
noy_ref.seq_dict()
# Create masked withY reference
if reference_mask is not None:
withy_out = ref_obj.mask_reference(reference_mask, xy)
else:
logger.info(
"No reference mask provided, copying full reference to {} as "
"XY reference to prevent damage, modification, etc. to original "
"reference.".format(xy))
subprocess.call(["cp", ref_obj.filepath, xy])
withy_out = xy
withy_ref = reftools.RefFasta(withy_out, samtools_path, bwa_path)
withy_ref.seq_dict()
if bwa_index is True:
noy_ref.index_bwa()
withy_ref.index_bwa()
return (noy_ref, withy_ref)
[docs]def chrom_stats(bam_obj_list, chrom_list, use_counts):
"""
Runs chrom stats module.
Calculates mean depth and mapq across entire scaffolds for a list of bam
files
Input
-----
bam_obj_list : list
List of bam.BamFile() objects. Need to have been created using same
reference (i.e., seq names and lengths are the same)
chrom_list : list
List of chromosome names to analyze
use_counts : bool
If True, will just grab counts from bam index INSTEAD of traversing
for depth and mapq
Returns
-------
tuple
Tuple containing two dictionaries with results for
depth and mapq, respectively. Or, if use_counts is True, returns a
tuple containing the count dictionary and None.
"""
logger = logging.getLogger("xyalign")
logger.info(
"Running CHROM_STATS on following bam files: {}".format(
", ".join([x.filepath for x in bam_obj_list])))
comp_check = utils.check_compatibility_bam_list(bam_obj_list)
if comp_check is False:
logger.error(
"Bam files incompatible - ensure same reference used for all files "
"and that sequence portion of all bam headers match. Exiting")
logging.shutdown()
sys.exit(1)
if chrom_list == ["ALL"] or chrom_list == ["all"]:
chrom_list = list(bam_obj_list[0].chromosome_names())
missing_chroms = bam_obj_list[0].check_chrom_in_bam(chrom_list)
if len(missing_chroms) != 0:
logger.error(
"One or more chromosomes provided via --chromosomes not "
"present in bam files. Exiting.")
logging.shutdown()
sys.exit(1)
if use_counts is False:
chrom_depth_dict = collections.OrderedDict()
chrom_mapq_dict = collections.OrderedDict()
chrom_depth_dict["header"] = [
os.path.basename(x.filepath) for x in bam_obj_list]
chrom_depth_dict["header"].insert(0, "chrom")
chrom_mapq_dict["header"] = [
os.path.basename(x.filepath) for x in bam_obj_list]
chrom_mapq_dict["header"].insert(0, "chrom")
for chromosome in chrom_list:
chrom_results = [
x.chrom_stats(chromosome, False) for x in bam_obj_list]
chrom_depth_dict[chromosome], chrom_mapq_dict[chromosome] = zip(
*chrom_results)
chrom_depth_dict[chromosome] = (chromosome,) + chrom_depth_dict[chromosome]
chrom_mapq_dict[chromosome] = (chromosome,) + chrom_mapq_dict[chromosome]
return (chrom_depth_dict, chrom_mapq_dict)
else:
chrom_count_dict = collections.OrderedDict()
chrom_count_dict["header"] = [
os.path.basename(x.filepath) for x in bam_obj_list]
chrom_count_dict["header"].insert(0, "chrom")
for chromosome in chrom_list:
chrom_count_dict[chromosome] = [chromosome]
for i in bam_obj_list:
idx_stats = i.chrom_counts()
for k in idx_stats:
if k.contig in chrom_count_dict:
chrom_count_dict[k.contig].append(int(k.mapped))
return (chrom_count_dict, None)
[docs]def bam_analysis(
input_bam_obj, platypus_calling, platypus_path, vcf_log, ref_obj,
input_chroms, cpus, out_vcf, no_variant_plots, window_size, target_bed,
sample_id, readbalance_prefix, variant_site_quality,
variant_genotype_quality, variant_depth, marker_size, marker_transparency,
homogenize_read_balance, data_frame_readbalance, min_variant_count,
no_bam_analysis, ignore_duplicates, exact_depth, whole_genome_threshold,
mapq_cutoff, min_depth_filter, max_depth_filter, depth_mapq_prefix,
bam_data_frame, output_bed_high, output_bed_low, use_bed_for_platypus,
coordinate_scale, fixed):
"""
Runs bam analyis part of XYalign pipeline on bam file.
* (Optionally) calls variants using Platypus
* (Optionally) parses and filters Platypus vcf, and plots read balance
* (Optionally) Calculates window based metrics from the bam file:
depth and mapq
* (optionally) Plots window-based metrics
* Outputs two bed files: high quality windows, and low quality windows.
Parameters
----------
input_bam_obj : bam.BamFile() object
platypus_calling : bool
If True, will call and analyze variants
platypus_path : str
Command to call platypus (e.g, "platypus")
vcf_log : str
Path to file for platypus log
ref_obj : reftools.RefFasta() object
input_chroms : list
Chromosomes to analyze
cpus : int
Number of threads/cpus
out_vcf : str
Output vcf path/name
no_variant_plots : bool
If True, will not plot read balance
window_size : int or None
Window size for sliding window analyses (both bam and vcf). If None,
will use regions in target_bed
target_bed : str or None
Path to bed file containing targets to use in sliding window analyses
sample_id : str
readbalance_prefix : str
Prefix, including full path, to use for output files for
readbalance analyses
variant_site_quality : int
Minimum site quality (PHRED) for a site to be included in
readbalance analyses
variant_genotype_quality : int
Minimum genotype quality for a site to be included in read balance analyses
variant_depth : int
Minimum depth for a site to be included in read balance analyses
marker_size : float
Marker size for plotting genome scatter plots
marker_transparency: float
Value to use for marker transparency in genome scatter plots
homogenize_read_balance : bool
If true, will subtract values less than 0.5 from 1. I.e., 0.25 and 0.75
would be treated equivalently
data_frame_readbalance: str
Path of output file for full read balance dataframe
min_variant_count : int
Minimum number of variants in a given window for the window to be
plotted in window-based read balance analyses
no_bam_analysis : bool
If True, no bam analyses will take place
ignore_duplicates : bool
If True, duplicates excluded from bam analyses
exact_depth : bool
If True, exact depth calculated in each window. Else, a much faster
approximation will be used
whole_genome_threshold : bool
If True, values for depth filters will be calculated using mean from
across all chromosomes included in analyses. Else, mean will be taken
per chromosome
min_depth_filter : float
Minimum depth threshold for a window to be considered high. Calculated
as mean depth * min_depth_filter.
max_depth_filter : float
Maximum depth threshold for a window to be considered high. Calculated
as mean depth * min_depth_filter.
depth_mapq_prefix : str
Prefix, including full path, to be used for files output from depth and
mapq analyses
bam_data_frame : str
Full path to output file for dataframe containing all data from bam
analyses
output_bed_high : str
Full path to output bed containing high quality (i.e., passing
filters) windows
output_bed_low : str
Full path to output bed containing low quality (i.e., failing
filters) windows
use_bed_for_platypus : bool
If True, use output_bed_high as regions for Platypus calling
coordinate_scale : int
Divide all coordinates by this value for plotting. In most cases, 1000000
will be ideal for eukaryotic genomes.
fixed : bool
If False, only plots histogram for values between 0.05 and 1.0
(non-inclusive). If True, plots histogram of all variants.
Returns
-------
tuple
(list of pandas dataframes with passing windows,
list of pandas dataframes with failing windows)
"""
logger = logging.getLogger("xyalign")
# Depth/MAPQ
if no_bam_analysis is not True:
all_df = []
pass_df = []
fail_df = []
for chromosome in input_chroms:
if window_size is not None and window_size != "None":
data = input_bam_obj.analyze_bam(
chromosome, ignore_duplicates,
exact_depth, int(window_size))
else:
data = input_bam_obj.analyze_bam(
chromosome, ignore_duplicates,
exact_depth, None, target_bed)
if whole_genome_threshold is True:
tup = utils.make_region_lists_genome_filters(
data, mapq_cutoff,
min_depth_filter, max_depth_filter)
else:
tup = utils.make_region_lists_chromosome_filters(
data, mapq_cutoff,
min_depth_filter, max_depth_filter)
all_df.append(data)
pass_df.append(tup[0])
fail_df.append(tup[1])
utils.plot_depth_mapq(
data, depth_mapq_prefix, sample_id,
input_bam_obj.get_chrom_length(chromosome), marker_size,
marker_transparency, coordinate_scale)
all_concat = pd.concat(all_df)
all_concat.to_csv(
bam_data_frame, index=False, sep="\t", quoting=csv.QUOTE_NONE)
utils.output_bed(output_bed_high, *pass_df)
utils.output_bed(output_bed_low, *fail_df)
# Platypus
if platypus_calling is True:
if use_bed_for_platypus is True:
a = input_bam_obj.platypus_caller(
platypus_path, vcf_log,
ref_obj.filepath, input_chroms, cpus, out_vcf,
output_bed_high)
else:
a = input_bam_obj.platypus_caller(
platypus_path, vcf_log,
ref_obj.filepath, input_chroms, cpus, out_vcf,
None)
if a != 0:
logger.error("Error in platypus calling on {}".format(
input_bam_obj.filepath))
logging.shutdown()
sys.exit(1)
noprocess_vcf_object = variants.VCFFile(out_vcf)
if no_variant_plots is not True:
if window_size is not None and window_size != "None":
noprocess_vcf_object.plot_variants_per_chrom(
chrom_list=input_chroms,
sampleID=sample_id,
output_prefix=readbalance_prefix,
site_qual=variant_site_quality,
genotype_qual=variant_genotype_quality,
depth=variant_depth,
MarkerSize=marker_size,
MarkerAlpha=marker_transparency,
bamfile_obj=input_bam_obj,
variant_caller="platypus",
homogenize=homogenize_read_balance,
dataframe_out=data_frame_readbalance,
min_count=min_variant_count,
window_size=int(window_size),
x_scale=coordinate_scale,
target_file=None,
include_fixed=fixed)
else:
noprocess_vcf_object.plot_variants_per_chrom(
chrom_list=input_chroms,
sampleID=sample_id,
output_prefix=readbalance_prefix,
site_qual=variant_site_quality,
genotype_qual=variant_genotype_quality,
depth=variant_depth,
MarkerSize=marker_size,
MarkerAlpha=marker_transparency,
bamfile_obj=input_bam_obj,
variant_caller="platypus",
homogenize=homogenize_read_balance,
dataframe_out=data_frame_readbalance,
min_count=min_variant_count,
window_size=None,
x_scale=coordinate_scale,
target_file=target_bed,
include_fixed=fixed)
return(pass_df, fail_df)
[docs]def ploidy_analysis(
passing_df, failing_df, no_perm_test, no_ks_test, no_bootstrap,
input_chroms, x_chromosome, y_chromosome, results_dir,
num_permutations, num_bootstraps, sample_id):
"""
Runs the ploidy analysis part of XYalign.
* Runs permutation test to systematically compare means between
every possible pair of chromosomes
* Runs K-S two sample test to systematically compare distributions between
every possible pair of chromosomes
* Bootstraps the mean depth ratio for every possible pair of chromosomes
Parameters
----------
passing_df : list
Passing pandas dataframes, one per chromosome
failing_df : list
Failing pandas dataframes, one per chromosome
no_perm_test : bool
If False, permutation test will be run
no_ks_test : bool
If False, KS test will be run
no_bootstrap : bool
If False, bootstrap analysis will be run
input_chroms : list
Chromosomes/scaffolds to analyze
x_chromosome : list
X-linked scaffolds
y_chromosome : list
Y-likned scaffolds
results_dir : str
Full path to directory to output results
num_permutations : int
Number of permutations
num_bootstraps : int
Number of bootstrap replicates
sample_id : str
Returns
-------
dictionary
Results for each test. Keys: perm, ks, boot.
"""
# Permutations
if no_perm_test is False:
if y_chromosome != [None]:
sex_chromosomes = x_chromosome + y_chromosome
perm_res_x = []
perm_res_y = []
else:
sex_chromosomes = x_chromosome
perm_res_x = []
perm_res_y = None
autosomes = [
x for x in input_chroms if x not in sex_chromosomes]
for c in autosomes:
perm_res_x.append(ploidy.permutation_test_chromosomes(
pd.concat(passing_df), c,
str(x_chromosome[0]), "chrom",
"depth", num_permutations,
results_dir + "/{}.{}_{}_permutation_results.txt".format(
sample_id, c, str(x_chromosome[0]))))
if perm_res_y is not None:
perm_res_y.append(ploidy.permutation_test_chromosomes(
pd.concat(passing_df), c,
str(y_chromosome[0]), "chrom",
"depth", num_permutations,
results_dir + "/{}.{}_{}_permutation_results.txt".format(
sample_id, c, str(y_chromosome[0]))))
if perm_res_y is not None:
sex_perm_res = ploidy.permutation_test_chromosomes(
pd.concat(passing_df), str(x_chromosome[0]),
str(y_chromosome[0]),
"chrom", "depth", num_permutations,
results_dir + "/{}.{}_{}_permutation_results.txt".format(
sample_id, str(x_chromosome[0]), str(y_chromosome[0])))
else:
sex_perm_res = None
# K-S Two Sample
if no_ks_test is False:
if y_chromosome != [None]:
sex_chromosomes = x_chromosome + y_chromosome
ks_res_x = []
ks_res_y = []
else:
sex_chromosomes = x_chromosome
ks_res_x = []
ks_res_y = None
autosomes = [
x for x in input_chroms if x not in sex_chromosomes]
for c in autosomes:
ks_res_x.append(ploidy.ks_two_sample(
pd.concat(passing_df), c,
str(x_chromosome[0]), "chrom", "depth",
results_dir + "/{}.{}_{}_ks_results.txt".format(
sample_id, c, str(x_chromosome[0]))))
if ks_res_y is not None:
ks_res_y.append(ploidy.ks_two_sample(
pd.concat(passing_df), c,
str(y_chromosome[0]), "chrom", "depth",
results_dir + "/{}.{}_{}_ks_results.txt".format(
sample_id, c, str(y_chromosome[0]))))
if ks_res_y is not None:
sex_ks_res = ploidy.ks_two_sample(
pd.concat(passing_df), str(x_chromosome[0]),
str(y_chromosome[0]), "chrom", "depth",
results_dir + "/{}.{}_{}_ks_results.txt".format(
sample_id, str(x_chromosome[0]), str(y_chromosome[0])))
else:
sex_ks_res = None
# Bootstrap
if no_bootstrap is False:
if y_chromosome != [None]:
sex_chromosomes = x_chromosome + y_chromosome
boot_res_x = []
boot_res_y = []
else:
sex_chromosomes = x_chromosome
boot_res_x = []
boot_res_y = None
autosomes = [
x for x in input_chroms if x not in sex_chromosomes]
for c in autosomes:
boot_res_x.append(ploidy.bootstrap(
pd.concat(passing_df), c,
str(x_chromosome[0]), "chrom",
"depth", num_bootstraps,
results_dir + "/{}.{}_{}_bootstrap_results.txt".format(
sample_id, c, str(x_chromosome[0]))))
if boot_res_y is not None:
boot_res_y.append(ploidy.bootstrap(
pd.concat(passing_df), c,
str(y_chromosome[0]), "chrom",
"depth", num_bootstraps,
results_dir + "/{}.{}_{}_bootstrap_results.txt".format(
sample_id, c, str(y_chromosome[0]))))
if boot_res_y is not None:
sex_boot_res = ploidy.bootstrap(
pd.concat(passing_df), str(x_chromosome[0]),
str(y_chromosome[0]),
"chrom", "depth", num_bootstraps,
results_dir + "/{}.{}_{}_bootstrap_results.txt".format(
sample_id, str(x_chromosome[0]), str(y_chromosome[0])))
else:
sex_boot_res = None
return {
"perm": [perm_res_x, perm_res_y, sex_perm_res],
"ks": [ks_res_x, ks_res_y, sex_ks_res],
"boot": [boot_res_x, boot_res_y, sex_boot_res]}
[docs]def remapping(
input_bam_obj, y_pres, masked_references, samtools_path, sambamba_path,
repairsh_path, shufflesh_path, bwa_path, bwa_flags, single_end, bam_dir,
fastq_dir, sample_id, x_chromosome, y_chromosome, cpus,
xmx, fastq_compression, cleanup, read_group_id):
"""
Runs remapping steps of XYalign.
* Strips, sorts, and re-pair reads from the sex chromosomes (collecting read
group information)
* Maps (with sorting) reads (with read group information) to appropriate
reference based on presence (or not) of Y chromosome
* Merge bam files (if more than one read group)
Parameters
----------
input_bam_obj : bam.BamFile() object
y_pres : bool
True if Y chromosome present in individual
masked_references : tuple
Masked reference objects (xx, xy)
samtools_path : str
Path/command to call samtools
sambamba_path : str
Path/command to call sambamba
repairsh_path : str
Path/command to call repair.sh
shufflesh_path : str
Path/command to call shuffle.sh
bwa_path : str
Path/command to call bwa
bwa_flags : str
Flags to use for bwa mapping
single_end : bool
If True, reads treated as single end
bam_dir : str
Path to output directory for bam files
fastq_dir : str
Path to output directory for fastq files
sample_id : str
x_chromosome : list
X-linked scaffolds
y_chromosome : list
Y-linked scaffolds
cpus : int
Number of threads/cpus
xmx : str
Value to be combined with -Xmx for java programs (i.e., 4g would
result in -Xmx4g)
fastq_compression : int
Compression level for fastq files. 0 leaves fastq files uncompressed.
Otherwise values should be between 1 and 9 (inclusive), with
larger values indicating more compression
cleanup : bool
If True, will delete temporary files
read_group_id : str
ID to use to add read group information
Returns
-------
str
Path to bam containing remapped sex chromsomes
"""
if y_pres is True:
new_reference = masked_references[1]
else:
new_reference = masked_references[0]
new_fastqs = input_bam_obj.strip_reads(
repairsh_path, shufflesh_path, single_end, fastq_dir,
sample_id, x_chromosome + y_chromosome, xmx,
fastq_compression, cleanup, read_group_id)
with open(new_fastqs[0]) as f:
read_group_and_fastqs = [line.strip() for line in f]
read_group_and_fastqs = [x.split() for x in read_group_and_fastqs]
if new_fastqs[1] is not None:
with open(new_fastqs[1]) as f:
read_group_headers = [line.split() for line in f]
temp_bam_list = []
for i in read_group_and_fastqs:
if i != [""]:
rg_id = i[0]
fastq_files = i[1:]
for j in read_group_headers:
for k in j:
if k[0:2] == 'ID':
if k[3:] == rg_id:
rg_tag = "\t".join(j)
break
temp_bam = assemble.bwa_mem_mapping_sambamba(
bwa_path, samtools_path, sambamba_path,
new_reference, "{}/{}.sex_chroms.{}".format(
bam_dir, sample_id, rg_id),
fastq_files, cpus, rg_tag,
[str(x).strip() for x in bwa_flags.split()])
temp_bam_list.append(temp_bam)
if len(temp_bam_list) < 2:
new_bam = temp_bam_list[0]
else:
new_bam = bam.samtools_merge(
samtools_path, temp_bam_list, "{}/{}.sex_chroms".format(
bam_dir, sample_id), cpus)
else:
fastq_files = read_group_and_fastqs[0][1:]
new_bam = assemble.bwa_mem_mapping_sambamba(
bwa_path, samtools_path, sambamba_path,
new_reference, "{}/{}.sex_chroms.{}".format(
bam_dir, sample_id, rg_id),
fastq_files, cpus, "None",
[str(x).strip() for x in bwa_flags.split()])
return new_bam
[docs]def swap_sex_chroms(
input_bam_obj, new_bam_obj, samtools_path, sambamba_path, x_chromosome,
y_chromosome, bam_dir, sample_id, cpus, xyalign_params):
"""
Switches sex chromosmes from new_bam_file with those in original bam file
Parameters
----------
input_bam_obj : bam.BamFile() object
Original input bam file object
new_bam_obj : bam.BamFile() object
Bam file object containing newly mapped sex chromosomes (to insert)
samtools_path : str
Path/command to call samtools
sambamba_path : str
Path/command to call sambamba
x_chromosome : list
X-linked scaffolds
y_chromosome : str
Y-linked scaffolds
bam_dir : str
Path to bam output directory
sample_id : str
cpus : int
Number of threads/cpus
xyalign_params : dict
Dictionary of xyalign_params to add to bam header
Returns
-------
str
Path to new bam file containing original autosomes and new sex chromosomes
"""
merged_bam = bam.switch_sex_chromosomes_sambamba(
samtools_path, sambamba_path, input_bam_obj.filepath,
new_bam_obj.filepath, x_chromosome + y_chromosome,
bam_dir, sample_id, cpus, xyalign_params)
return merged_bam
[docs]def main():
# Version - placeholder for now - need to incorporate it into __init__.py
citation = """
XYalign: Inferring Sex Chromosome Ploidy in NGS Data
Timothy H Webster, Tanya Phung, Madeline Couse, Bruno Grande, Eric Karlins,
Phillip Richmond, Whitney Whitford, Melissa A. Wilson Sayres
2017
Version: {}
""".format(__version__)
# Start timer
xyalign_start = time.time()
# Grab arguments
args = parse_args()
# Create directory structure if not already in place
utils.validate_dir(args.output_dir, "fastq")
utils.validate_dir(args.output_dir, "bam")
utils.validate_dir(args.output_dir, "reference")
utils.validate_dir(args.output_dir, "bed")
utils.validate_dir(args.output_dir, "vcf")
utils.validate_dir(args.output_dir, "plots")
utils.validate_dir(args.output_dir, "results")
utils.validate_dir(args.output_dir, "logfiles")
# Set up logfile
logfile_path = os.path.join(args.output_dir, "logfiles")
if args.logfile is not None:
logfile = os.path.join(
logfile_path, args.logfile)
else:
logfile = os.path.join(
logfile_path, "{}_xyalign.log".format(
args.sample_id))
# Initiate logging
logger = logging.getLogger("xyalign")
logger.setLevel(logging.DEBUG)
# Create File Handler
fh = logging.FileHandler(logfile, mode="w")
fh.setLevel(logging.DEBUG)
# Create Console Handler
ch = logging.StreamHandler()
if args.reporting_level == "DEBUG":
ch.setLevel(logging.DEBUG)
elif args.reporting_level == "INFO":
ch.setLevel(logging.INFO)
elif args.reporting_level == "ERROR":
ch.setLevel(logging.ERROR)
elif args.reporting_level == "CRITICAL":
ch.setLevel(logging.CRITICAL)
else:
ch.setLevel(logging.INFO)
# Set Formatter
formatter = logging.Formatter(
'%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fh.setFormatter(formatter)
ch.setFormatter(formatter)
# Add handlers to logger
logger.addHandler(fh)
logger.addHandler(ch)
# Log citation
logger.info("{}".format(citation))
# Set up param dictionary
xyalign_params_dict = {'ID': 'XYalign', 'VN': __version__, 'CL': []}
p = ""
for arg in args.__dict__:
p = p + "{}={}, ".format(arg, args.__dict__[arg])
xyalign_params_dict['CL'].append("{}={}".format(arg, args.__dict__[arg]))
# Log parameters and pipeline start
logger.info("Parameters: {}".format(p))
logger.info("Beginning XYalign")
# Check for external programs
logger.info("Checking external programs")
utils.validate_external_prog(args.repairsh_path, "bbmap's repair.sh")
utils.validate_external_prog(args.bedtools_path, "bedtools")
utils.validate_external_prog(args.bwa_path, "bwa")
utils.validate_external_prog(args.platypus_path, "platypus")
utils.validate_external_prog(args.samtools_path, "samtools")
utils.validate_external_prog(args.sambamba_path, "sambamba")
logger.info("All external program paths successfully checked")
# Setup output paths
fastq_path = os.path.join(args.output_dir, "fastq")
bam_path = os.path.join(args.output_dir, "bam")
reference_path = os.path.join(args.output_dir, "reference")
bed_path = os.path.join(args.output_dir, "bed")
vcf_path = os.path.join(args.output_dir, "vcf")
plots_path = os.path.join(args.output_dir, "plots")
results_path = os.path.join(args.output_dir, "results")
# Create paths for output files
# reference-related
if args.xx_ref_out_name is not None:
xx_out = os.path.join(reference_path, args.xx_ref_out_name)
elif args.xx_ref_out is not None:
xx_out = args.xx_ref_out
else:
xx_out = os.path.join(reference_path, "xyalign_noY.masked.fa")
if args.xy_ref_out_name is not None:
xy_out = os.path.join(reference_path, args.xy_ref_out_name)
elif args.xy_ref_out is not None:
xy_out = args.xy_ref_out
else:
xy_out = os.path.join(reference_path, "xyalign_withY.masked.fa")
# variant/vcf related
noprocessing_vcf = os.path.join(
vcf_path, "{}.noprocessing.vcf".format(
args.sample_id))
postprocessing_vcf = os.path.join(
vcf_path, "{}.postprocessing.vcf".format(
args.sample_id))
if args.platypus_logfile is not None:
plat_log = args.platypus_logfile
else:
plat_log = args.sample_id
noprocessing_vcf_log = os.path.join(
logfile_path, "{}_noprocessing_platypus.log".format(
plat_log))
postprocessing_vcf_log = os.path.join(
logfile_path, "{}_postprocessing_platypus.log".format(
plat_log))
readbalance_prefix_noprocessing = os.path.join(
plots_path, "{}_noprocessing".format(args.sample_id))
readbalance_prefix_postprocessing = os.path.join(
plots_path, "{}_postprocessing".format(args.sample_id))
# Depth/mapq related
chrom_stats_mapq = os.path.join(
results_path, "{}_chrom_stats_mapq.txt".format(args.sample_id))
chrom_stats_depth = os.path.join(
results_path, "{}_chrom_stats_depth.txt".format(args.sample_id))
chrom_stats_count = os.path.join(
results_path, "{}_chrom_stats_count.txt".format(args.sample_id))
depth_mapq_prefix_noprocessing = os.path.join(
plots_path, "{}_noprocessing".format(args.sample_id))
depth_mapq_prefix_postprocessing = os.path.join(
plots_path, "{}_postprocessing".format(args.sample_id))
data_frame_preprocessing = os.path.join(
bed_path, "{}_full_dataframe_depth_mapq_preprocessing.csv".format(
args.sample_id))
data_frame_postprocessing = os.path.join(
bed_path, "{}_full_dataframe_depth_mapq_postprocessing.csv".format(
args.sample_id))
data_frame_readbalance_preprocessing = os.path.join(
bed_path, "{}_full_dataframe_readbalance_preprocessing.csv".format(
args.sample_id))
data_frame_readbalance_postprocessing = os.path.join(
bed_path, "{}_full_dataframe_readbalance_postprocessing.csv".format(
args.sample_id))
high_prefix = "{}_highquality_preprocessing".format(args.sample_id)
output_bed_high_preprocessing = os.path.join(
bed_path, "{}.bed".format(high_prefix))
low_prefix = "{}_lowquality_preprocessing".format(args.sample_id)
output_bed_low_preprocessing = os.path.join(
bed_path, "{}.bed".format(low_prefix))
high_prefix_postprocessing = "{}_highquality_postprocessing".format(
args.sample_id)
output_bed_high_postprocessing = os.path.join(
bed_path, "{}.bed".format(high_prefix_postprocessing))
low_prefix_postprocessing = "{}_lowquality_postprocessing".format(
args.sample_id)
output_bed_low_postprocessing = os.path.join(
bed_path, "{}.bed".format(low_prefix_postprocessing))
# Set cleanup
if args.no_cleanup is True:
cleanup = False
else:
cleanup = True
######################################
# Run XYalign #
######################################
# Reference Prep Only
if args.PREPARE_REFERENCE is True:
logger.info(
"PREPARE_REFERENCE set, so only preparing reference fastas.")
ref = reftools.RefFasta(args.ref, args.samtools_path, args.bwa_path)
ref_prep(
ref_obj=ref,
ref_mask=args.reference_mask,
ref_dir=reference_path,
xx=xx_out,
xy=xy_out,
y_chromosome=args.y_chromosome,
samtools_path=args.samtools_path,
bwa_path=args.bwa_path,
bwa_index=args.bwa_index)
logger.info("PREPARE_REFERENCE complete.")
logger.info("XYalign complete. Elapsed time: {} seconds".format(
time.time() - xyalign_start))
logging.shutdown()
sys.exit(0)
if args.CHROM_STATS is True:
logger.info(
"CHROM_STATS set, will iterate through bam files to calculate "
"chromosome-wide averages.")
bam_list = [bam.BamFile(x, args.samtools_path) for x in args.bam]
chrom_stats_results = chrom_stats(
bam_obj_list=bam_list,
chrom_list=args.chromosomes,
use_counts=args.use_counts)
if args.use_counts is False:
cs_depth_dict = chrom_stats_results[0]
cs_mapq_dict = chrom_stats_results[1]
with open(chrom_stats_depth, "w") as f:
for i in cs_depth_dict:
out_line = "\t".join([str(x) for x in cs_depth_dict[i]])
f.write("{}\n".format(out_line))
with open(chrom_stats_mapq, "w") as f:
for i in cs_mapq_dict:
out_line = "\t".join([str(x) for x in cs_mapq_dict[i]])
f.write("{}\n".format(out_line))
else:
cs_count_dict = chrom_stats_results[0]
with open(chrom_stats_count, "w") as f:
for i in cs_count_dict:
out_line = "\t".join([str(x) for x in cs_count_dict[i]])
f.write("{}\n".format(out_line))
logger.info("CHROM_STATS complete.")
logger.info("XYalign complete. Elapsed time: {} seconds".format(
time.time() - xyalign_start))
logging.shutdown()
sys.exit(0)
input_bam = bam.BamFile(args.bam[0], args.samtools_path)
if args.chromosomes == ["ALL"] or args.chromosomes == ["all"]:
input_chromosomes = list(input_bam.chromosome_names())
else:
input_chromosomes = args.chromosomes
if any(
[args.ANALYZE_BAM, args.CHARACTERIZE_SEX_CHROMS, args.STRIP_READS]) is True:
missing_chroms = input_bam.check_chrom_in_bam(input_chromosomes)
if len(missing_chroms) != 0:
logger.error(
"One or more chromosomes provided via --chromosomes not "
"present in bam file. Exiting.")
logging.shutdown()
sys.exit(1)
# Strip reads only
# This module is isolated first because it does not require a reference fasta
if args.STRIP_READS is True:
logger.info(
"STRIP_READS set, so only stripping reads from {}.".format(
input_bam.filepath))
if args.chromosomes == ["ALL"] or args.chromosomes == ["all"]:
stripped_fastqs = input_bam.strip_reads(
args.repairsh_path, args.shufflesh_path, args.single_end,
fastq_path, args.sample_id,
[], args.xmx, args.fastq_compression, cleanup, args.read_group_id)
else:
stripped_fastqs = input_bam.strip_reads(
args.repairsh_path, args.shufflesh_path, args.single_end,
fastq_path, args.sample_id,
input_chromosomes, args.xmx, args.fastq_compression, cleanup,
args.read_group_id)
logger.info("STRIP_READS complete. Output in {}".format(fastq_path))
logger.info("XYalign complete. Elapsed time: {} seconds".format(
time.time() - xyalign_start))
logging.shutdown()
sys.exit(0)
# Other modules
ref = reftools.RefFasta(args.ref, args.samtools_path, args.bwa_path)
# Check to ensure bam and fasta are compatible and imports work
if args.skip_compatibility_check is False:
compatible = utils.check_bam_fasta_compatibility(input_bam, ref)
if compatible is False:
logger.error(
"Exiting due to compatibility issues between {} and {}. Check: "
"1) that this fasta was used when generating this bam, 2) that "
"sequence lengths and names are identical between the two files. "
"You can check 2) by comparing the bam header (e.g., "
"samtools -H {}) with a sequence dictionary (.dict) created for "
"{}. If you think this is an error, you can use the flag "
"--skip_compatibility_check.".format(
input_bam.filepath, ref.filepath, input_bam.filepath,
ref.filepath))
logging.shutdown()
sys.exit(1)
# Stats Only
if args.ANALYZE_BAM is True:
logger.info(
"ANALYZE_BAM set, so only running steps required "
"for bam analysis.")
if args.platypus_calling == "both" or args.platypus_calling == "before":
call_variants = True
else:
call_variants = False
bam_analysis(
input_bam_obj=input_bam,
platypus_calling=call_variants,
platypus_path=args.platypus_path,
vcf_log=noprocessing_vcf_log,
ref_obj=ref,
input_chroms=input_chromosomes,
cpus=args.cpus,
out_vcf=noprocessing_vcf,
no_variant_plots=args.no_variant_plots,
window_size=args.window_size,
target_bed=args.target_bed,
sample_id=args.sample_id,
readbalance_prefix=readbalance_prefix_noprocessing,
variant_site_quality=args.variant_site_quality,
variant_genotype_quality=args.variant_genotype_quality,
variant_depth=args.variant_depth,
marker_size=args.marker_size,
marker_transparency=args.marker_transparency,
homogenize_read_balance=args.homogenize_read_balance,
data_frame_readbalance=data_frame_readbalance_preprocessing,
min_variant_count=args.min_variant_count,
no_bam_analysis=args.no_bam_analysis,
ignore_duplicates=args.ignore_duplicates,
exact_depth=args.exact_depth,
whole_genome_threshold=args.whole_genome_threshold,
mapq_cutoff=args.mapq_cutoff,
min_depth_filter=args.min_depth_filter,
max_depth_filter=args.max_depth_filter,
depth_mapq_prefix=depth_mapq_prefix_noprocessing,
bam_data_frame=data_frame_preprocessing,
output_bed_high=output_bed_high_preprocessing,
output_bed_low=output_bed_low_preprocessing,
use_bed_for_platypus=False,
coordinate_scale=args.coordinate_scale,
fixed=args.include_fixed)
logger.info("ANALYZE_BAM complete.")
logger.info("XYalign complete. Elapsed time: {} seconds".format(
time.time() - xyalign_start))
logging.shutdown()
sys.exit(0)
# Ploidy Estimation Only
elif args.CHARACTERIZE_SEX_CHROMS is True:
logger.info(
"CHARACTERIZE_SEX_CHROMS set, so only running steps required "
"for to characterize sex chromosome complement. Note that "
"this involve both playtpus calling and bam analysis too.")
if args.platypus_calling == "both" or args.platypus_calling == "before":
call_variants = True
else:
call_variants = False
bam_dfs = bam_analysis(
input_bam_obj=input_bam,
platypus_calling=call_variants,
platypus_path=args.platypus_path,
vcf_log=noprocessing_vcf_log,
ref_obj=ref,
input_chroms=input_chromosomes,
cpus=args.cpus,
out_vcf=noprocessing_vcf,
no_variant_plots=args.no_variant_plots,
window_size=args.window_size,
target_bed=args.target_bed,
sample_id=args.sample_id,
readbalance_prefix=readbalance_prefix_noprocessing,
variant_site_quality=args.variant_site_quality,
variant_genotype_quality=args.variant_genotype_quality,
variant_depth=args.variant_depth,
marker_size=args.marker_size,
marker_transparency=args.marker_transparency,
homogenize_read_balance=args.homogenize_read_balance,
data_frame_readbalance=data_frame_readbalance_preprocessing,
min_variant_count=args.min_variant_count,
no_bam_analysis=args.no_bam_analysis,
ignore_duplicates=args.ignore_duplicates,
exact_depth=args.exact_depth,
whole_genome_threshold=args.whole_genome_threshold,
mapq_cutoff=args.mapq_cutoff,
min_depth_filter=args.min_depth_filter,
max_depth_filter=args.max_depth_filter,
depth_mapq_prefix=depth_mapq_prefix_noprocessing,
bam_data_frame=data_frame_preprocessing,
output_bed_high=output_bed_high_preprocessing,
output_bed_low=output_bed_low_preprocessing,
use_bed_for_platypus=False,
coordinate_scale=args.coordinate_scale,
fixed=args.include_fixed)
ploidy_results = ploidy_analysis(
passing_df=bam_dfs[0],
failing_df=bam_dfs[1],
no_perm_test=args.no_perm_test,
no_ks_test=args.no_ks_test,
no_bootstrap=args.no_bootstrap,
input_chroms=input_chromosomes,
x_chromosome=args.x_chromosome,
y_chromosome=args.y_chromosome,
results_dir=results_path,
num_permutations=args.num_permutations,
num_bootstraps=args.num_bootstraps,
sample_id=args.sample_id)
logger.info("CHARACTERIZE_SEX_CHROMS complete.")
logger.info("XYalign complete. Elapsed time: {} seconds".format(
time.time() - xyalign_start))
logging.shutdown()
sys.exit(0)
# Remapping Only
elif args.REMAPPING is True:
logger.info(
"REMAPPING set, so only running steps required for remapping. "
"--y_present or --y_absent must be set")
if args.y_present is True:
y_present = True
logger.info("--y_present provided, so remapping for XY individual")
elif args.y_absent is True:
logger.info("--y_absent provided, so remapping for XX individual")
y_present = False
else:
logger.error(
"--y_present or --y_absent required for --REMAPPING. "
"Exiting.")
logging.shutdown()
sys.exit(1)
if args.xx_ref_in is None or args.xy_ref_in is None:
logger.info(
"Input masked reference not provided for both "
"XX and XY mapping, so creating both")
masked_refs = ref_prep(
ref_obj=ref,
ref_mask=args.reference_mask,
ref_dir=reference_path,
xx=xx_out,
xy=xy_out,
y_chromosome=args.y_chromosome,
samtools_path=args.samtools_path,
bwa_path=args.bwa_path,
bwa_index=True)
else:
xx = reftools.RefFasta(
args.xx_ref_in, args.samtools_path, args.bwa_path)
xy = reftools.RefFasta(
args.xy_ref_in, args.samtools_path, args.bwa_path)
xx.conditional_index_bwa()
xx.conditional_seq_dict()
xy.conditional_index_bwa()
xy.conditional_seq_dict()
masked_refs = (xx, xy)
sex_chrom_bam = bam.BamFile(
remapping(
input_bam_obj=input_bam,
y_pres=y_present,
masked_references=masked_refs,
samtools_path=args.samtools_path,
sambamba_path=args.sambamba_path,
repairsh_path=args.repairsh_path,
shufflesh_path=args.shufflesh_path,
bwa_path=args.bwa_path,
bwa_flags=args.bwa_flags,
single_end=args.single_end,
bam_dir=bam_path,
fastq_dir=fastq_path,
sample_id=args.sample_id,
x_chromosome=args.x_chromosome,
y_chromosome=args.y_chromosome,
cpus=args.cpus,
xmx=args.xmx,
fastq_compression=args.fastq_compression,
cleanup=cleanup,
read_group_id=args.read_group_id),
args.samtools_path)
if args.sex_chrom_bam_only is True:
final_bam = sex_chrom_bam
else:
final_bam = bam.BamFile(
swap_sex_chroms(
input_bam_obj=input_bam,
new_bam_obj=sex_chrom_bam,
samtools_path=args.samtools_path,
sambamba_path=args.sambamba_path,
x_chromosome=args.x_chromosome,
y_chromosome=args.y_chromosome,
bam_dir=bam_path,
sample_id=args.sample_id,
cpus=args.cpus,
xyalign_params=xyalign_params_dict),
args.samtools_path)
logger.info("REMAPPING complete.")
logger.info("XYalign complete. Elapsed time: {} seconds".format(
time.time() - xyalign_start))
logging.shutdown()
sys.exit(0)
# Full Pipeline
else:
logger.info(
"Running entire XYalign pipeline")
missing_chroms = input_bam.check_chrom_in_bam(input_chromosomes)
if len(missing_chroms) != 0:
logger.error(
"One or more chromosomes provided via --chromosomes not "
"present in bam file. Exiting.")
logging.shutdown()
sys.exit(1)
if args.xx_ref_in is None or args.xy_ref_in is None:
logger.info(
"Input masked reference not provided for both "
"XX and XY mapping, so creating both")
masked_refs = ref_prep(
ref_obj=ref,
ref_mask=args.reference_mask,
ref_dir=reference_path,
xx=xx_out,
xy=xy_out,
y_chromosome=args.y_chromosome,
samtools_path=args.samtools_path,
bwa_path=args.bwa_path,
bwa_index=True)
else:
xx = reftools.RefFasta(
args.xx_ref_in, args.samtools_path, args.bwa_path)
xy = reftools.RefFasta(
args.xy_ref_in, args.samtools_path, args.bwa_path)
xx.conditional_index_bwa()
xx.conditional_seq_dict()
xy.conditional_index_bwa()
xy.conditional_seq_dict()
masked_refs = (xx, xy)
if args.platypus_calling == "both" or args.platypus_calling == "before":
call_variants = True
else:
call_variants = False
bam_dfs = bam_analysis(
input_bam_obj=input_bam,
platypus_calling=call_variants,
platypus_path=args.platypus_path,
vcf_log=noprocessing_vcf_log,
ref_obj=ref,
input_chroms=input_chromosomes,
cpus=args.cpus,
out_vcf=noprocessing_vcf,
no_variant_plots=args.no_variant_plots,
window_size=args.window_size,
target_bed=args.target_bed,
sample_id=args.sample_id,
readbalance_prefix=readbalance_prefix_noprocessing,
variant_site_quality=args.variant_site_quality,
variant_genotype_quality=args.variant_genotype_quality,
variant_depth=args.variant_depth,
marker_size=args.marker_size,
marker_transparency=args.marker_transparency,
homogenize_read_balance=args.homogenize_read_balance,
data_frame_readbalance=data_frame_readbalance_preprocessing,
min_variant_count=args.min_variant_count,
no_bam_analysis=args.no_bam_analysis,
ignore_duplicates=args.ignore_duplicates,
exact_depth=args.exact_depth,
whole_genome_threshold=args.whole_genome_threshold,
mapq_cutoff=args.mapq_cutoff,
min_depth_filter=args.min_depth_filter,
max_depth_filter=args.max_depth_filter,
depth_mapq_prefix=depth_mapq_prefix_noprocessing,
bam_data_frame=data_frame_preprocessing,
output_bed_high=output_bed_high_preprocessing,
output_bed_low=output_bed_low_preprocessing,
use_bed_for_platypus=False,
coordinate_scale=args.coordinate_scale,
fixed=args.include_fixed)
ploidy_results = ploidy_analysis(
passing_df=bam_dfs[0],
failing_df=bam_dfs[1],
no_perm_test=args.no_perm_test,
no_ks_test=args.no_ks_test,
no_bootstrap=args.no_bootstrap,
input_chroms=input_chromosomes,
x_chromosome=args.x_chromosome,
y_chromosome=args.y_chromosome,
results_dir=results_path,
num_permutations=args.num_permutations,
num_bootstraps=args.num_bootstraps,
sample_id=args.sample_id)
if args.y_present is True:
y_present = True
logger.info("--y_present provided, so remapping for XY individual")
elif args.y_absent is True:
y_present = False
logger.info("--y_absent provided, so remapping for XX individual")
else:
if ploidy_results["boot"][2][2] > args.sex_chrom_calling_threshold:
y_present = False
logger.info(
"X/Y depth ratio ({}) > {}. Y inferred to be absent.".format(
ploidy_results["boot"][2][2], args.sex_chrom_calling_threshold))
else:
y_present = True
logger.info(
"X/Y depth ratio ({}) <= {}. Y inferred to be present.".format(
ploidy_results["boot"][2][2], args.sex_chrom_calling_threshold))
sex_chrom_bam = bam.BamFile(
remapping(
input_bam_obj=input_bam,
y_pres=y_present,
masked_references=masked_refs,
samtools_path=args.samtools_path,
sambamba_path=args.sambamba_path,
repairsh_path=args.repairsh_path,
shufflesh_path=args.shufflesh_path,
bwa_path=args.bwa_path,
bwa_flags=args.bwa_flags,
single_end=args.single_end,
bam_dir=bam_path,
fastq_dir=fastq_path,
sample_id=args.sample_id,
x_chromosome=args.x_chromosome,
y_chromosome=args.y_chromosome,
cpus=args.cpus,
xmx=args.xmx,
fastq_compression=args.fastq_compression,
cleanup=cleanup,
read_group_id=args.read_group_id),
args.samtools_path)
if args.sex_chrom_bam_only is True:
final_bam = sex_chrom_bam
else:
final_bam = bam.BamFile(
swap_sex_chroms(
input_bam_obj=input_bam,
new_bam_obj=sex_chrom_bam,
samtools_path=args.samtools_path,
sambamba_path=args.sambamba_path,
x_chromosome=args.x_chromosome,
y_chromosome=args.y_chromosome,
bam_dir=bam_path,
sample_id=args.sample_id,
cpus=args.cpus,
xyalign_params=xyalign_params_dict),
args.samtools_path)
if args.platypus_calling == "both" or args.platypus_calling == "after":
call_variants = True
else:
call_variants = False
bam_analysis(
input_bam_obj=final_bam,
platypus_calling=call_variants,
platypus_path=args.platypus_path,
vcf_log=postprocessing_vcf_log,
ref_obj=ref,
input_chroms=input_chromosomes,
cpus=args.cpus,
out_vcf=postprocessing_vcf,
no_variant_plots=args.no_variant_plots,
window_size=args.window_size,
target_bed=args.target_bed,
sample_id=args.sample_id,
readbalance_prefix=readbalance_prefix_postprocessing,
variant_site_quality=args.variant_site_quality,
variant_genotype_quality=args.variant_genotype_quality,
variant_depth=args.variant_depth,
marker_size=args.marker_size,
marker_transparency=args.marker_transparency,
homogenize_read_balance=args.homogenize_read_balance,
data_frame_readbalance=data_frame_readbalance_postprocessing,
min_variant_count=args.min_variant_count,
no_bam_analysis=args.no_bam_analysis,
ignore_duplicates=args.ignore_duplicates,
exact_depth=args.exact_depth,
whole_genome_threshold=args.whole_genome_threshold,
mapq_cutoff=args.mapq_cutoff,
min_depth_filter=args.min_depth_filter,
max_depth_filter=args.max_depth_filter,
depth_mapq_prefix=depth_mapq_prefix_postprocessing,
bam_data_frame=data_frame_postprocessing,
output_bed_high=output_bed_high_postprocessing,
output_bed_low=output_bed_low_postprocessing,
use_bed_for_platypus=True,
coordinate_scale=args.coordinate_scale,
fixed=args.include_fixed)
logger.info("XYalign complete. Elapsed time: {} seconds".format(
time.time() - xyalign_start))
logging.shutdown()
sys.exit(0)
if __name__ == "__main__":
main()