Source code for xyalign.utils

# Part of XYalign
# Collection of miscellaneous functions

from __future__ import division
from __future__ import print_function
import logging
import os
import subprocess
import numpy as np
import pandas as pd
import pybedtools
import time
# Matplotlib needs to be called in this way to set the display variable
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt

matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42


# Create logger for utils submodule
utils_logger = logging.getLogger("xyalign.utils")


[docs]def validate_external_prog(prog_path, prog_name):
	"""
	Checks to see if external program can be called using provided path

	Parameters
	----------

	prog_path: path to call program
	prog_name: name of program

	Returns
	-------

	int
		0

	"""
	try:
		a = subprocess.Popen(
			[prog_path], stderr=subprocess.PIPE, stdout=subprocess.PIPE)
		utils_logger.info(
			"{} successfully called using path: {}".format(prog_name, prog_path))
	except OSError as e:
		utils_logger.error(
			"ERROR: {} not available from path: {}".format(prog_name, prog_path))
		logging.shutdown()
		raise OSError("ERROR: {} not available from path: {}".format(
			prog_name, prog_path))
	return 0


[docs]def validate_dir(parent_dir, dir_name):
	"""
	Checks if directory exists and if not, creates it.

	Parameters
	----------

	parent_dir : Parent directory name
	dir_name : Name of the new directory

	Returns
	-------

	bool
		whether the directory existed

	"""
	full_path = os.path.join(parent_dir, dir_name)
	exists = os.path.exists(full_path)
	if not exists:
		os.makedirs(full_path)
	return exists


[docs]def check_bam_fasta_compatibility(bam_object, fasta_object):
	"""
	Checks to see if bam and fasta sequence names and lengths are
	equivalent (i.e., if it is likely that the bam file was generated
	using the fasta in question).

	Parameters
	----------

	bam_object : BamFile() object
	fasta_object: RefFasta() object

	Returns
	-------

	bool
		True if sequence names and lengths match. False otherwise.

	"""
	utils_logger.info(
		"Checking compatibility of {} and {}".format(
			bam_object.filepath, fasta_object.filepath))

	b_names = bam_object.chromosome_names()
	f_names = fasta_object.chromosome_names()
	b_lengths = bam_object.chromosome_lengths()
	f_lengths = fasta_object.chromosome_lengths()

	if b_lengths == f_lengths:
		if b_names == f_names:
			utils_logger.info("Bam and Fasta are compatible")
			return True
		else:
			utils_logger.error(
				"Bam and Fasta are incompatible.  Sequence lengths are identical "
				"but names are not.  Check chromosome ids to ensure compatibility. "
				"{} ids are: {}\n"
				"{} ids are: {}".format(
					bam_object.filepath, b_names,
					fasta_object.filepath, f_names))
			return False
	else:
		utils_logger.error(
			"Bam and Fasta are incompatible. Check sequence names and lengths "
			"to ensure correct fasta was used as input. Chromosome ids and "
			"lengths for {} are:\n"
			"{}\n"
			"Chromosome ids and lengths for {} are:\n"
			"{}".format(
				bam_object.filepath, zip(b_names, b_lengths),
				fasta_object.filepath, zip(f_names, f_lengths)))
		return False


[docs]def check_compatibility_bam_list(bam_obj_list):
	"""
	Checks to see if bam sequence names and lengths are
	equivalent (i.e., if it is likely that the bam files were generated
	using the same reference genome).

	Parameters
	----------

	bam_obj_list : list
		List of bam.BamFile() objects

	Returns
	-------

	bool
		True if sequence names and lengths match. False otherwise.

	"""
	utils_logger.info(
		"Checking compatibility of: ".format(
			", ".join([x.filepath for x in bam_obj_list])))

	seq_names = [x.chromosome_names() for x in bam_obj_list]
	seq_lengths = [x.chromosome_lengths() for x in bam_obj_list]

	# Checking for ANY incompatibility
	uniq_seq_names = [x for x in seq_names if x != seq_names[0]]
	uniq_seq_lens = [x for x in seq_lengths if x != seq_lengths[0]]

	if len(uniq_seq_names) == 0 and len(uniq_seq_lens) == 0:
		return True
	else:
		utils_logger.info("Bam files contain different headers.")
		return False


[docs]def merge_bed_files(output_file, *bed_files):
	"""
	This function simply takes an output_file (full path to desired output file)
	and an arbitrary number of external bed files (including full path),
	and merges the bed files into the output_file

	Parameters
	----------

	output_file : str
		Full path to and name of desired output bed file
	*bed_files
		Variable length argument list of external bed files (include full path)

	Returns
	-------

	str
		path to output_file

	"""
	merged_bed_start = time.time()
	utils_logger.info("Merging bed files: {}".format(
		" ".join(bed_files)))
	a = pybedtools.BedTool(bed_files[0])
	for i in bed_files[1:]:
		b = a.cat(i)
	c = b.sort().merge()
	c.saveas(output_file)
	utils_logger.info(
		"Merging bed files complete. Elapsed time: {} seconds".format(
			time.time() - merged_bed_start))
	return output_file


[docs]def make_region_lists_genome_filters(
	depthAndMapqDf, mapqCutoff, min_depth, max_depth):
	"""
	Filters a pandas dataframe for mapq and depth based on using all values
	from across the entire genome

	Parameters
	----------

	depthAndMapqDf : pandas dataframe
		Must have 'depth' and 'mapq' columns
	mapqCutoff : int
		The minimum mapq for a window to be considered high quality
	min_depth : float
		Fraction of mean to set as minimum depth
	max_depth : float
		Multiple of mean to set as maximum depth

	Returns
	-------

	tuple
		(passing dataframe, failing dataframe)
	"""
	make_region_lists_start = time.time()
	depth_mean = depthAndMapqDf["depth"].mean()
	depth_sd = depthAndMapqDf["depth"].std()

	depthMin = depth_mean * min_depth
	depthMax = depth_mean * max_depth

	utils_logger.info(
		"Filtering dataframe for mapq (MAPQ >= {}) "
		"and depth (between {} and {})".format(
			mapqCutoff, depthMin, depthMax))

	good = (
		(depthAndMapqDf["mapq"] >= mapqCutoff) &
		(depthAndMapqDf["depth"] > depthMin) &
		(depthAndMapqDf["depth"] < depthMax))
	dfGood = depthAndMapqDf[good]
	dfBad = depthAndMapqDf[~good]

	utils_logger.info("Filtering complete. Elapsed time: {} seconds".format(
		time.time() - make_region_lists_start))
	return (dfGood, dfBad)


[docs]def make_region_lists_chromosome_filters(
	depthAndMapqDf, mapqCutoff, min_depth, max_depth):
	"""
	Filters a pandas dataframe for mapq and depth based on thresholds calculated
	per chromosome

	Parameters
	----------

	depthAndMapqDf : pandas dataframe
		Must have 'depth' and 'mapq' columns
	mapqCutoff : int
		The minimum mapq for a window to be considered high quality
	min_depth : float
		Fraction of mean to set as minimum depth
	max_depth : float
		Multiple of mean to set as maximum depth

	Returns
	-------

	tuple
		(passing dataframe, failing dataframe)
	"""
	make_region_lists_start = time.time()
	utils_logger.info(
		"Calculating filtering thresholds for mapq and depth on a per "
		"chromosome basis")

	indices = np.unique(depthAndMapqDf.chrom, return_index=True)[1]
	ordered_chrom_list = [
		depthAndMapqDf.chrom[index] for index in sorted(indices)]

	good_list = []
	bad_list = []

	for i in ordered_chrom_list:
		df = depthAndMapqDf.loc[depthAndMapqDf["chrom"] == i]
		depth_mean = df["depth"].mean()
		depth_sd = df["depth"].std()

		depthMin = depth_mean * min_depth
		depthMax = depth_mean * max_depth

		utils_logger.info(
			"Filtering chromosome {} for mapq (MAPQ >= {}) "
			"and depth (between {} and {})".format(
				i, mapqCutoff, depthMin, depthMax))

		good = (
			(df["chrom"] == i) &
			(df["mapq"] >= mapqCutoff) &
			(df["depth"] > depthMin) &
			(df["depth"] < depthMax))
		good_list.append(df[good])
		bad_list.append(df[~good])
	dfGood = pd.concat(good_list)
	dfBad = pd.concat(bad_list)

	utils_logger.info("Filtering complete. Elapsed time: {} seconds".format(
		time.time() - make_region_lists_start))
	return (dfGood, dfBad)


[docs]def output_bed(outBed, *regionDfs):
	"""
	Concatenate and merges dataframes into an output bed file

	Parameters
	----------

	outBed : str
		The full path to and name of the output bed file
	*regionDfs
		Variable length list of dataframes to be included

	Returns
	-------

	int
		0

	"""
	dfComb = pd.concat(regionDfs)
	regionList = dfComb.ix[:, "chrom":"stop"].values.tolist()
	merge = pybedtools.BedTool(regionList).sort().merge()
	with open(outBed, 'w') as output:
		output.write(str(merge))
	return 0


[docs]def output_bed_no_merge(outBed, *regionDfs):
	"""
	Concatenate dataframes into an output bed file. This will preserve all
	columns after the first three as well.

	Parameters
	----------

	outBed : str
		The full path to and name of the output bed file
	*regionDfs
		Variable length list of dataframes to be included

	Returns
	-------

	int
		0

	"""
	dfComb = pd.concat(regionDfs)
	regionList = dfComb.ix[:,:].values.tolist()
	regionList_str = [[str(y) for y in x] for x in regionList]
	sorted = pybedtools.BedTool(regionList_str).sort()
	with open(outBed, 'w') as output:
		output.write(str(sorted))
	return 0


[docs]def chromosome_wide_plot(
	chrom, positions, y_value, measure_name, sampleID, output_prefix,
	MarkerSize, MarkerAlpha, Xlim, Ylim, x_scale=1000000):
	"""
	Plots values across a chromosome, where the x axis is the position along the
	chromosome and the Y axis is the value of the measure of interest.

	Parameters
	----------

	chrom : str
		Name of the chromosome
	positions : numpy array
		Genomic coordinates
	y_value : numpy	array
		The values of the measure of interest
	measure_name : str
		The name of the measure of interest (y axis title)
	sampleID : str
		The name of the sample
	output_prefix : str
		Full path to and prefix of desired output plot
	MarkerSize : float
		Size in points^2
	MarkerAlpha : float
		Transparency (0 to 1)
	Xlim : float
		Maximum X value
	Ylim : float
		Maximum Y value
	x_scale : int
		Divide all x values (including Xlim) by this value. Default is 1000000 (1MB)

	Returns
	-------

	int
		0

	"""
	if "x" in chrom.lower():
		Color = "green"
	elif "y" in chrom.lower():
		Color = "blue"
	else:
		Color = "red"
	fig = plt.figure(figsize=(15, 5))
	axes = fig.add_subplot(111)
	positions = np.divide(positions, float(x_scale))
	axes.scatter(
		positions, y_value, c=Color, alpha=MarkerAlpha, s=MarkerSize, lw=0)
	axes.set_xlim(0, (Xlim / float(x_scale)))
	axes.set_ylim(0, Ylim)
	axes.set_title("%s - %s" % (sampleID, chrom))
	if x_scale == 1000000:
		scale_label = "(MB)"
	elif x_scale == 1000:
		scale_label = "(KB)"
	elif x_scale == 1:
		scale_label = "(BP)"
	else:
		scale_label = "(divided by {})".formatt(x_scale)
	axes.set_xlabel("Chromosomal Position {}".format(scale_label))
	axes.set_ylabel(measure_name)
	plt.savefig("{}_{}_{}_GenomicScatter.pdf".format(
		output_prefix, chrom, measure_name), transparent=True)
	# plt.savefig("{}_{}_{}_GenomicScatter.svg".format(
	# 	output_prefix, chrom, measure_name))
	# plt.savefig("{}_{}_{}_GenomicScatter.png".format(
	# 	output_prefix, chrom, measure_name))
	plt.close(fig)
	return 0


[docs]def hist_array(chrom, value_array, measure_name, sampleID, output_prefix):
	"""
	Plots a histogram of an array of values of interest. Intended for mapq and
	depth, but generalizeable.  Separate function from variants.hist_read_balance
	because that function eliminates fixed variants, while this function will
	plot all values.

	Parameters
	----------

	chrom : str
		Name of the chromosome
	value_array : numpy array
		Read balance values
	measure_name : str
		The name of the measure of interest (y axis title)
	sampleID : str
		Sample name or id to include in the plot title
	output_prefix : str
		Desired prefix (including full path) of the output files

	Returns
	-------

	int
		0 if plotting successful, 1 otherwise.

	"""
	if len(value_array) < 1:
		utils_logger.info(
			"No {} values on {} to plot histogram. Skipping.".format(
				measure_name, chrom))
		return 1
	else:
		value_array = value_array[~np.isnan(value_array)]
		if "x" in chrom.lower():
			Color = "green"
		elif "y" in chrom.lower():
			Color = "blue"
		else:
			Color = "red"
		fig = plt.figure(figsize=(8, 8))
		axes = fig.add_subplot(111)
		axes.set_title("{} - {}".format(sampleID, chrom))
		axes.set_xlabel("{}".format(measure_name))
		axes.set_ylabel("Frequency")
		axes.hist(value_array, bins=50, color=Color)
		plt.savefig("{}_{}_{}_Hist.pdf".format(
			output_prefix, chrom, measure_name), transparent=True)
		# plt.savefig("{}_{}_{}_Hist.svg".format(output_prefix, chrom, measure_name))
		# plt.savefig("{}_{}_{}_Hist.png".format(output_prefix, chrom, measure_name))
		plt.close(fig)
		utils_logger.info(
			"{} histogram of {} complete.".format(measure_name, chrom))
		return 0


[docs]def plot_depth_mapq(
	window_df, output_prefix, sampleID, chrom_length, MarkerSize,
	MarkerAlpha, x_scale=1000000):
	"""
	Creates histograms and genome-wide plots of various metrics.

	Parameters
	----------

	window_df : pandas dataframe
		Columns must include chrom, start, depth, and mapq (at least)
	output_prefix : str
		Path and prefix of output files to create
	sampleID : str
		Sample ID
	chrom_length: int
		Length of chromosome
	x_scale : int
		Divide all x values (including Xlim) by this value for chromosome_wide_plot.
		Default is 1000000 (1MB)

	Returns
	-------

	int
		0

	"""
	chromosome = window_df["chrom"][1]

	# Create genome-wide plots based on window means
	if window_df is not None:
		# depth plot
		chromosome_wide_plot(
			chromosome, window_df["start"].values, window_df["depth"].values,
			"Depth", sampleID, output_prefix,
			MarkerSize, MarkerAlpha,
			chrom_length, 100, x_scale)
		hist_array(
			chromosome, window_df["depth"], "Depth", sampleID, output_prefix)

		# mapping quality plot
		chromosome_wide_plot(
			chromosome, window_df["start"].values, window_df["mapq"].values,
			"Mapq", sampleID, output_prefix,
			MarkerSize, MarkerAlpha, chrom_length, 80, x_scale)
		hist_array(
			chromosome, window_df["mapq"], "Mapq", sampleID, output_prefix)

	return 0


[docs]def before_after_plot(
	chrom, positions, values_before, values_after, measure_name, sampleID,
	output_prefix, MarkerSize, MarkerAlpha, Xlim,
	YMin="auto", YMax="auto", x_scale=1000000, Color="black"):
	"""
	Plots difference between before/after values (after minus before) across
	a chromosome.

	Parameters
	----------

	chrom : str
		Name of the chromosome
	positions : numpy array
		Genomic coordinates
	values_before : numpy array
		The values of the measure of interest in the "before" condidtion
	values_after : numpy array
		The values of the measure of interest in the "after" condidtion
	measure_name : str
		The name of the measure of interest (for y-axis title)
	sampleID : str
		The name of the sample
	output_prefix : str
		Full path to and prefix of desired output plot
	MarkerSize : float
		Size in points^2
	MarkerAlpha : float
		Transparency (0 to 1)
	Xlim : float
		Maximum X value
	YMin : str, int, or float
		If "auto", will allow matplotlib to automatically determine limit. Otherwise,
		will set the y axis minimum to the value provided (int or float)
	YMax : str, int, or float
		If "auto", will allow matplotlib to automatically determine limit. Otherwise,
		will set the y axis maximum to the value provided (int or float)
	x_scale : int
		Divide all x values (including Xlim) by this value. Default is 1000000 (1MB)
	Color : str
		Color to use for points. See matplotlib documentation for acceptable options

	Returns
	-------

	int
		0 if plotting successful, 1 otherwise
	"""
	# Check that lengths are identical
	if len(values_before) != len(values_after):
		utils_logger.error(
			"Error. values_before and values_after must have the same length")
		return 1
	else:
		value_array = np.nan_to_num(values_after) - np.nan_to_num(values_before)
		fig = plt.figure(figsize=(15, 5))
		axes = fig.add_subplot(111)
		positions = np.divide(positions, float(x_scale))
		axes.scatter(
			positions, value_array, c=Color, alpha=MarkerAlpha, s=MarkerSize, lw=0)
		axes.set_xlim(0, (Xlim / float(x_scale)))
		if YMin != "auto":
			if "." in YMin:
				YMin = float(YMin)
			else:
				YMin = int(YMin)
			if YMax != "auto":
				if "." in YMax:
					YMax = float(YMax)
				else:
					YMax = int(YMax)
				axes.ylim((YMin, YMax))
			else:
				axes.ylim(ymin=YMin)
		elif YMax != "auto":
				if "." in YMax:
					YMax = float(YMax)
				else:
					YMax = int(YMax)
				axes.ylim(ymax=YMax)
		axes.set_title("%s - %s" % (sampleID, chrom))
		if x_scale == 1000000:
			scale_label = "(MB)"
		elif x_scale == 1000:
			scale_label = "(KB)"
		elif x_scale == 1:
			scale_label = "(BP)"
		else:
			scale_label = "(divided by {})".formatt(x_scale)
		axes.set_xlabel("Chromosomal Position {}".format(scale_label))
		axes.set_ylabel("Difference in {}".format(measure_name))
		plt.savefig("{}_{}_{}_BeforeAfterScatter.pdf".format(
			output_prefix, chrom, measure_name), transparent=True)
		# plt.savefig("{}_{}_{}_BeforeAfterScatter.svg".format(
		# 	output_prefix, chrom, measure_name))
		# plt.savefig("{}_{}_{}_BeforeAfterScatter.png".format(
		# 	output_prefix, chrom, measure_name))
		plt.close(fig)
		return 0