Source code for xyalign.variants

# Part of XYalign
# Functions for calling and processing variants

from __future__ import absolute_import
from __future__ import division
import csv
import gzip
import logging
import subprocess
import time
import xyalign.utils as utils
# from xyalign import utils
import numpy as np
import pandas as pd
# Matplotlib needs to be called in this way to set the display variable
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt

matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

# Create logger for variants submodule
variants_logger = logging.getLogger("xyalign.variants")


[docs]class VCFFile(): """ A class for working with external vcf files. Attributes ---------- filepath : str Full path to external vcf file bgzip : str Full path to bgzip. Default = 'bgzip' tabix : str Full path to tabix. Default = "tabix" """ def __init__( self, filepath, bgzip="bgzip", tabix="tabix", no_initial_compress=False): self.filepath = filepath self.bgzip = bgzip self.tabix = tabix self.logger = logging.getLogger("xyalign.variants.VCFFile") self.logger.info("Creating a VCFFile instance for {}".format( self.filepath)) if no_initial_compress is False: if self.is_bgzipped() is False: self.compress_vcf() self.index_vcf()
[docs] def is_bgzipped(self): """ Checks to see if vcf file is gzipped, simply by looking for a .gz or .bgz ending. If .gz or .bgz ending exists, assumes file is compressed using bgzip. Returns ------- bool True if ends in .gz, False otherwise """ self.logger.info("Checking for .gz or .bgz ending in {}".format( self.filepath)) if self.filepath[-3:] == ".gz": self.logger.info("Ends in .gz") return True elif self.filepath[-4:] == ".bgz": self.logger.info("Ends in .bgz") return True else: self.logger.info("Does not end in .gz or .bgz") return False
[docs] def compress_vcf(self): """ Compresses vcf file using bgzip. Returns ------- bool True if successful Raises ------- RuntimeError If return code from external call is not 0 """ self.logger.info("Compressing vcf file {}".format(self.filepath)) bgzip_start = time.time() rc = subprocess.call([self.bgzip, "-f", self.filepath]) if rc == 0: self.logger.info("Compression complete. Elapsed time: {} seconds".format( time.time() - bgzip_start)) self.filepath = self.filepath + ".gz" return True else: self.logger.error("Unable to compress vcf file: {}. Exiting.".format( self.filepath)) logging.shutdown() raise RuntimeError("Unable to compress vcf file. Exiting.")
[docs] def index_vcf(self): """ Indexes vcf file using tabix. If file does not end in .gz, will compress with bgzip (by calling self.compress_vcf). Note: Files MUST be compressed using bgzip. Returns ------- bool True if successful. Raises ------ RuntimeError If return code from external call is not 0. """ self.logger.info("Indexing vcf file: {}".format(self.filepath)) index_start = time.time() rc = subprocess.call([self.tabix, "-f", "-p", "vcf", self.filepath]) if rc == 0: self.logger.info("Indexing complete. Elapsed time: {} seconds.".format( time.time() - index_start)) return True else: self.logger.info("Unable to index vcf file: {}. Exiting".format( self.filepath)) logging.shutdown() raise RuntimeError("unable to index vcf file. Exiting.")
[docs] def parse_platypus_VCF(self, site_qual, genotype_qual, depth, chrom): """ Parse vcf generated by Platypus to grab read balance. Note that this is hard-coded to Platypus (version 0.8.1) and will not generalize to vcfs generated with other programs (and, potentially, other versions of Platypus) Parameters ---------- site_qual : int Minimum (PHRED) site quality at which sites should be included genotype_qual : int Minimum (PHRED) genotype quality at which sites should be included depth : int Minimum depth at which sites should be included chrom : str Name of the chromosome to include Returns ------- tuple five corresponding arrays of the same length: (position across the chromosome, site quality, read balance, genotype quality, and depth) """ parse_start = time.time() self.logger.info("Parsing {} for read balance.".format(self.filepath)) positions = [] quality = [] readBalance = [] gqs = [] dps = [] # Right now, cyvcf2 (and the htslib have trouble reading platypus vcfs), # so currently hard coding parsing. # for variant in cyvcf2.VCF(self.filepath): # pos = variant.start # qual = variant.QUAL # if qual < site_qual: # continue # if variant.format("GQ")[0] < genotype_qual: # continue # TC = variant.INFO.get("TC") # TR = variant.INFO.get("TR") # if TR == 0 or TC == 0: # continue # if TR + TC < depth: # continue if self.is_bgzipped() is True: infile = gzip.open("{}".format(self.filepath), 'rb') else: infile = open("{}".format(self.filepath), 'r') for line in infile: cols = line.strip('\n').split('\t') if cols[0] != chrom: continue pos = int(cols[1]) qual = float(cols[5]) # Check site quality if qual < site_qual: continue # Check genotype quality try: GQ = int(cols[9].split(':')[3]) except IndexError: self.logger.error("Error parsing line: {}".format(line)) continue if GQ < genotype_qual: continue # Grab allele depths and check for depth try: TR = cols[7].split(';')[17].split('=')[1] TC = cols[7].split(';')[14].split('=')[1] except IndexError: self.logger.error("Error parsing line: {}".format(line)) continue if ',' in TR or ',' in TC: continue TR = float(TR) TC = float(TC) if TR == 0 or TC == 0: continue if TC < depth: continue # Site passes filters - grab the read ratio ReadRatio = TR / TC # Add to arrays readBalance.append(ReadRatio) positions.append(pos) quality.append(qual) gqs.append(GQ) dps.append(TC) infile.close() self.logger.info("Parsing complete. Elapsed time: {} seconds".format( time.time() - parse_start)) return (positions, quality, readBalance, gqs, dps)
[docs] def plot_variants_per_chrom( self, chrom_list, sampleID, output_prefix, site_qual, genotype_qual, depth, MarkerSize, MarkerAlpha, bamfile_obj, variant_caller, homogenize, dataframe_out, min_count, window_size, x_scale=1000000, target_file=None, include_fixed=False): """ Parses a vcf file and plots read balance in separate plots for each chromosome in the input list Parameters ---------- chrom_list : list Chromosomes to include sampleID : str Sample ID (for plot titles) output_prefix : str Full path to and prefix of desired output plots site_qual : int Minimum (PHRED) site quality at which sites should be included genotype_qual : int Minimum (PHRED) genotype quality at which sites should be included depth : int Minimum depth at which sites should be included MarkerSize : float Size of markers (matplotlib sizes) to use in the figure MarkerAlpha : float Transparency (matplotlib values, 0 to 1) of markers bamfile_obj : BamFile() object Used to get chromosome lengths only variant_caller : str Variant caller used to generate vcf - currently only "platypus" supported homogenize: bool If True, all read balance values less than 0.5 will be transformed by subtracting the value from 1. For example, the values 0.25 and 0.75 would be treated as equivalent. dataframe_out : str Full path of file to write pandas dataframe to. Will overwire if exists min_count : int Minimum number of variants to include a window for plotting. window_size If int, the window size to use for sliding window analyses, if None intervals from target_file x_scale : int Divide all x values (including Xlim) by this value. Default is 1000000 (1MB) target_file : str Path to bed_file containing regions to analyze instead of windows of a fixed size. Will only be engaged if window_size is None include_fixed : bool If False, only plots histogram for values between 0.05 and 1.0. If True, plots histogram of all variants. Returns ------- int 0 if variants to analyze; 1 if no variants to analyze on any chromosome """ plot_start = time.time() self.logger.info("Plotting read balance from {} for chroms: {}".format( self.filepath, " ".join(chrom_list))) if variant_caller.lower() != "platypus": self.logger.error( "Error. Only 'platypus' currently supported as variant_caller.") logging.shutdown() raise RuntimeError( "Error. Only 'platypus' currently supported as variant_caller " "for plot_variants_per_chrom.") no_sites = [] all_df = [] for i in chrom_list: parse_results = self.parse_platypus_VCF( site_qual, genotype_qual, depth, i) if len(parse_results[0]) < 1: no_sites.append(i) else: chrom_len = bamfile_obj.get_chrom_length(i) plot_read_balance( i, parse_results[0], parse_results[2], sampleID, output_prefix, MarkerSize, MarkerAlpha, homogenize, chrom_len, x_scale) hist_read_balance( i, parse_results[2], sampleID, homogenize, output_prefix) rb_df = read_balance_per_window( i, parse_results[0], parse_results[2], sampleID, homogenize, chrom_len, window_size, target_file) all_df.append(rb_df) utils.chromosome_wide_plot( i, rb_df["start"].values, rb_df["count"], "Window_Variant_Count", sampleID, output_prefix, MarkerSize, MarkerAlpha, chrom_len, rb_df["count"].max(), x_scale) rb_df = rb_df[rb_df["count"] >= min_count] utils.chromosome_wide_plot( i, rb_df["start"].values, rb_df["balance"], "Window_Read_Balance", sampleID, output_prefix, MarkerSize, MarkerAlpha, chrom_len, 1.0, x_scale) if len(all_df) < 1: self.logger.error( "No chromosomes with any variants present") return 1 else: all_concat = pd.concat(all_df) all_concat.to_csv( dataframe_out, index=False, sep="\t", quoting=csv.QUOTE_NONE) if len(no_sites) >= 1: self.logger.info( "No variants passing filters on the following chromosomes: {}".format( " ".join(no_sites))) else: self.logger.info( "All chromosomes had variant sites passing filters.") self.logger.info( "Read balance plotting complete. Elapsed time: {} seconds".format( time.time() - plot_start)) return 0
[docs]def read_balance_per_window( chrom, positions, readBalance, sampleID, homogenize, chr_len, window_size, target_file=None): """ Calculates mean read balance per genomic window (defined by size or an external target bed file) for a given chromosome. Takes as input an array of positions and an array of read balances - the order of which must correspond exactly. In addition, the positions are expected to ALL BE ON THE SAME CHROMOSOME and be in numerically sorted order (i.e., the output of parse_platypus_VCF()) Parameters ---------- chrom : str Name of the chromosome positions : numpy array Positions along the chromosome (same length as readBalance) readBalance : numpy array Read balance corresponding with the positions in the positions array sampleID : str Sample name or id to include in the plot title homogenize: bool If True, all read balance values less than 0.5 will be transformed by subtracting the value from 1. For example, the values 0.25 and 0.75 would be treated as equivalent. chr_len : int Length of chromosome. Ignored if target_file is provided. window_size If int, the window size to use for sliding window analyses, if None intervals from target_file target_file : str Path to bed file containing regions to analyze instead of windows of a fixed size. Will only be engaged if window_size is None Returns ------- pandas dataframe With columns: "chrom", "start", "stop", "balance", and "count" """ readbalance_start = time.time() variants_logger.info( "Traversing {} to calculate mean read balance".format( chrom)) positions = np.asarray(positions) readBalance = np.asarray(readBalance) if window_size is not None: variants_logger.info( "Using windows size: {}".format(window_size)) num_windows = chr_len // window_size + 1 if chr_len % num_windows == 0: last_window_len = window_size else: last_window_len = chr_len % window_size chr_list = [] start_list = [] stop_list = [] balance_list = [] count_list = [] window_id = 0 start = 0 end = window_size window_count = 0 window_balances = [] for idx, i in enumerate(positions): if i < start: variants_logger.info( "Position {} is less than window start {}. Check that " "positions are sorted numerically. Exiting.".format(i, start)) raise RuntimeError( "Position {} is less than window start {}. Check that " "positions are sorted numerically. Exiting.".format(i, start)) elif i < end: window_count += 1 if homogenize is False: window_balances.append(readBalance[idx]) else: if readBalance[idx] > 0.5: window_balances.append(readBalance[idx]) else: window_balances.append(1.0 - readBalance[idx]) else: # i is >= end, so window needs to be reset while i >= end: chr_list.append(chrom) start_list.append(start) stop_list.append(end) if len(window_balances) == 0: balance_list.append(0) else: balance_list.append(np.mean(window_balances)) count_list.append(window_count) window_id += 1 if window_id > num_windows: variants_logger.info( "Exhausted windows, but positions still remaining. " "Ensure correct chromosome provided and that positions " "are numerically sorted.") raise RuntimeError( "Exhausted windows, but positions still remaining. " "Ensure correct chromosome provided and that positions " "are numerically sorted.") if window_id == num_windows - 1: start += window_size end += last_window_len else: start += window_size end += window_size window_balances = [] window_count = 0 window_count += 1 if homogenize is False: window_balances.append(readBalance[idx]) else: if readBalance[idx] > 0.5: window_balances.append(readBalance[idx]) else: window_balances.append(1.0 - readBalance[idx]) # process last window chr_list.append(chrom) start_list.append(start) stop_list.append(end) if len(window_balances) == 0: balance_list.append(0) else: balance_list.append(np.mean(window_balances)) count_list.append(window_count) elif target_file is not None: variants_logger.info( "Using targets from: {}".format(target_file)) with open(target_file) as f: targets = [x.strip() for x in f] targets = [x.split() for x in targets] targets = [x for x in targets if x[0] == chrom] while [""] in targets: targets.remove([""]) num_windows = len(targets) chr_list = [chrom] * num_windows start_list = [] stop_list = [] balance_list = [] count_list = [] window_id = 0 window_count = 0 window_balances = [] start = int(targets[0][1]) end = int(targets[0][2]) num_pos = len(positions) pos_idx = 0 while window_id < num_windows: if pos_idx < num_pos: if positions[pos_idx] < start: pos_idx += 1 elif positions[pos_idx] < end: window_count += 1 if homogenize is False: window_balances.append(readBalance[pos_idx]) else: if readBalance[pos_idx] > 0.5: window_balances.append(readBalance[pos_idx]) else: window_balances.append(1.0 - readBalance[pos_idx]) pos_idx += 1 else: start_list.append(start) stop_list.append(end) if len(window_balances) == 0: balance_list.append(0) else: balance_list.append(np.mean(window_balances)) count_list.append(window_count) window_id += 1 try: start = int(targets[window_id][1]) end = int(targets[window_id][2]) except IndexError: break window_count = 0 window_balances = [] else: start_list.append(start) stop_list.append(end) if len(window_balances) == 0: balance_list.append(0) else: balance_list.append(np.mean(window_balances)) count_list.append(window_count) window_id += 1 try: start = int(targets[window_id][1]) end = int(targets[window_id][2]) except IndexError: break window_count = 0 window_balances = [] else: variants_logger.error( "Both window_size and target_file set to None. " "Cannot proceed with read balance traversal. Exiting.") logging.shutdown() raise RuntimeError( "Both window_size and target_file set to None. " "Cannot proceed with read balance traversal. Exiting.") # Convert data into pandas data frames windows_df = pd.DataFrame({ "chrom": np.asarray(chr_list), "start": np.asarray(start_list), "stop": np.asarray(stop_list), "balance": np.asarray(balance_list), "count": np.asarray(count_list) })[["chrom", "start", "stop", "balance", "count"]] variants_logger.info("Analysis complete. Elapsed time: {} seconds".format( time.time() - readbalance_start)) return windows_df
[docs]def plot_read_balance( chrom, positions, readBalance, sampleID, output_prefix, MarkerSize, MarkerAlpha, homogenize, chrom_len, x_scale=1000000): """ Plots read balance at each SNP along a chromosome Parameters ---------- chrom : str Name of the chromosome positions : numpy array Positions along the chromosome (same length as readBalance) readBalance : numpy array Read balance corresponding with the positions in the positions array sampleID : str Sample name or id to include in the plot title output_prefix : str Desired prefix (including full path) of the output files MarkerSize : float Size of markers (matplotlib sizes) to use in the figure MarkerAlpha : float Transparency (matplotlib values) of markers for the figure homogenize: bool If True, all read balance values less than 0.5 will be transformed by subtracting the value from 1. For example, the values 0.25 and 0.75 would be treated as equivalent. chrom_len : int Length of chromosome x_scale : int Divide all x values (including Xlim) by this value. Default is 1000000 (1MB) Returns ------- int 0 """ if homogenize is True: readBalance = np.asarray([1.0 - x if x < 0.5 else x for x in readBalance]) if "x" in chrom.lower(): Color = "green" elif "y" in chrom.lower(): Color = "blue" else: Color = "red" fig = plt.figure(figsize=(15, 5)) axes = fig.add_subplot(111) positions = np.divide(positions, float(x_scale)) axes.scatter( positions, readBalance, c=Color, alpha=MarkerAlpha, s=MarkerSize, lw=0) axes.set_xlim(0, (chrom_len / float(x_scale))) axes.set_title(sampleID) if x_scale == 1000000: scale_label = "(MB)" elif x_scale == 1000: scale_label = "(KB)" elif x_scale == 1: scale_label = "(BP)" else: scale_label = "(divided by {})".format(x_scale) axes.set_xlabel("Chromosomal Position {}".format(scale_label)) axes.set_ylabel("Read Balance") # print(len(positions)) plt.savefig("{}_{}_ReadBalance_GenomicScatter.pdf".format( output_prefix, chrom), transparent=True) # plt.savefig("{}_{}_ReadBalance_GenomicScatter.svg".format( # output_prefix, chrom)) # plt.savefig("{}_{}_ReadBalance_GenomicScatter.png".format( # output_prefix, chrom)) plt.close(fig) variants_logger.info("Genomic read balance plot of {} complete.".format( chrom)) return 0
[docs]def hist_read_balance( chrom, readBalance, sampleID, homogenize, output_prefix, include_fixed=False): """ Plots a histogram of read balance values between 0.05 and 1.0 (non-incusive) Parameters ---------- chrom : str Name of the chromosome readBalance : list or numpy array Read balance values sampleID : str Sample name or id to include in the plot title homogenize: bool If True, all read balance values less than 0.5 will be transformed by subtracting the value from 1. For example, the values 0.25 and 0.75 would be treated as equivalent. output_prefix : str Desired prefix (including full path) of the output files include_fixed : bool If False, only plots histogram for values between 0.05 and 1.0. If True, plots histogram of all variants. Returns ------- int 0 if plotting successful, 1 otherwise. """ readBalance = np.asarray(readBalance) if include_fixed is False: read_balance = readBalance[ np.where((readBalance > 0.05) & (readBalance < 1))] else: read_balance = readBalance if len(read_balance) == 0: variants_logger.info( "No sites on {} to plot histogram. Skipping.".format(chrom)) return 1 if homogenize is True: read_balance = np.asarray([1.0 - x if x < 0.5 else x for x in read_balance]) if "x" in chrom.lower(): Color = "green" elif "y" in chrom.lower(): Color = "blue" else: Color = "red" fig = plt.figure(figsize=(8, 8)) axes = fig.add_subplot(111) axes.set_title(sampleID) axes.set_xlabel("Read Balance") axes.set_ylabel("Frequency") axes.hist(read_balance, bins=50, color=Color) plt.savefig("{}_{}_ReadBalance_Hist.pdf".format( output_prefix, chrom), transparent=True) # plt.savefig("{}_{}_ReadBalance_Hist.svg".format(output_prefix, chrom)) # plt.savefig("{}_{}_ReadBalance_Hist.png".format(output_prefix, chrom)) plt.close(fig) variants_logger.info( "Genomic read balance histogram of {} complete.".format( chrom)) return 0