# Part of XYalign
# Collection of miscellaneous functions
from __future__ import division
from __future__ import print_function
import logging
import os
import subprocess
import numpy as np
import pandas as pd
import pybedtools
import time
# Matplotlib needs to be called in this way to set the display variable
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
# Create logger for utils submodule
utils_logger = logging.getLogger("xyalign.utils")
[docs]def validate_external_prog(prog_path, prog_name):
"""
Checks to see if external program can be called using provided path
Parameters
----------
prog_path: path to call program
prog_name: name of program
Returns
-------
int
0
"""
try:
a = subprocess.Popen(
[prog_path], stderr=subprocess.PIPE, stdout=subprocess.PIPE)
utils_logger.info(
"{} successfully called using path: {}".format(prog_name, prog_path))
except OSError as e:
utils_logger.error(
"ERROR: {} not available from path: {}".format(prog_name, prog_path))
logging.shutdown()
raise OSError("ERROR: {} not available from path: {}".format(
prog_name, prog_path))
return 0
[docs]def validate_dir(parent_dir, dir_name):
"""
Checks if directory exists and if not, creates it.
Parameters
----------
parent_dir : Parent directory name
dir_name : Name of the new directory
Returns
-------
bool
whether the directory existed
"""
full_path = os.path.join(parent_dir, dir_name)
exists = os.path.exists(full_path)
if not exists:
os.makedirs(full_path)
return exists
[docs]def check_bam_fasta_compatibility(bam_object, fasta_object):
"""
Checks to see if bam and fasta sequence names and lengths are
equivalent (i.e., if it is likely that the bam file was generated
using the fasta in question).
Parameters
----------
bam_object : BamFile() object
fasta_object: RefFasta() object
Returns
-------
bool
True if sequence names and lengths match. False otherwise.
"""
utils_logger.info(
"Checking compatibility of {} and {}".format(
bam_object.filepath, fasta_object.filepath))
b_names = bam_object.chromosome_names()
f_names = fasta_object.chromosome_names()
b_lengths = bam_object.chromosome_lengths()
f_lengths = fasta_object.chromosome_lengths()
if b_lengths == f_lengths:
if b_names == f_names:
utils_logger.info("Bam and Fasta are compatible")
return True
else:
utils_logger.error(
"Bam and Fasta are incompatible. Sequence lengths are identical "
"but names are not. Check chromosome ids to ensure compatibility. "
"{} ids are: {}\n"
"{} ids are: {}".format(
bam_object.filepath, b_names,
fasta_object.filepath, f_names))
return False
else:
utils_logger.error(
"Bam and Fasta are incompatible. Check sequence names and lengths "
"to ensure correct fasta was used as input. Chromosome ids and "
"lengths for {} are:\n"
"{}\n"
"Chromosome ids and lengths for {} are:\n"
"{}".format(
bam_object.filepath, zip(b_names, b_lengths),
fasta_object.filepath, zip(f_names, f_lengths)))
return False
[docs]def check_compatibility_bam_list(bam_obj_list):
"""
Checks to see if bam sequence names and lengths are
equivalent (i.e., if it is likely that the bam files were generated
using the same reference genome).
Parameters
----------
bam_obj_list : list
List of bam.BamFile() objects
Returns
-------
bool
True if sequence names and lengths match. False otherwise.
"""
utils_logger.info(
"Checking compatibility of: ".format(
", ".join([x.filepath for x in bam_obj_list])))
seq_names = [x.chromosome_names() for x in bam_obj_list]
seq_lengths = [x.chromosome_lengths() for x in bam_obj_list]
# Checking for ANY incompatibility
uniq_seq_names = [x for x in seq_names if x != seq_names[0]]
uniq_seq_lens = [x for x in seq_lengths if x != seq_lengths[0]]
if len(uniq_seq_names) == 0 and len(uniq_seq_lens) == 0:
return True
else:
utils_logger.info("Bam files contain different headers.")
return False
[docs]def merge_bed_files(output_file, *bed_files):
"""
This function simply takes an output_file (full path to desired output file)
and an arbitrary number of external bed files (including full path),
and merges the bed files into the output_file
Parameters
----------
output_file : str
Full path to and name of desired output bed file
*bed_files
Variable length argument list of external bed files (include full path)
Returns
-------
str
path to output_file
"""
merged_bed_start = time.time()
utils_logger.info("Merging bed files: {}".format(
" ".join(bed_files)))
a = pybedtools.BedTool(bed_files[0])
for i in bed_files[1:]:
b = a.cat(i)
c = b.sort().merge()
c.saveas(output_file)
utils_logger.info(
"Merging bed files complete. Elapsed time: {} seconds".format(
time.time() - merged_bed_start))
return output_file
[docs]def make_region_lists_genome_filters(
depthAndMapqDf, mapqCutoff, min_depth, max_depth):
"""
Filters a pandas dataframe for mapq and depth based on using all values
from across the entire genome
Parameters
----------
depthAndMapqDf : pandas dataframe
Must have 'depth' and 'mapq' columns
mapqCutoff : int
The minimum mapq for a window to be considered high quality
min_depth : float
Fraction of mean to set as minimum depth
max_depth : float
Multiple of mean to set as maximum depth
Returns
-------
tuple
(passing dataframe, failing dataframe)
"""
make_region_lists_start = time.time()
depth_mean = depthAndMapqDf["depth"].mean()
depth_sd = depthAndMapqDf["depth"].std()
depthMin = depth_mean * min_depth
depthMax = depth_mean * max_depth
utils_logger.info(
"Filtering dataframe for mapq (MAPQ >= {}) "
"and depth (between {} and {})".format(
mapqCutoff, depthMin, depthMax))
good = (
(depthAndMapqDf["mapq"] >= mapqCutoff) &
(depthAndMapqDf["depth"] > depthMin) &
(depthAndMapqDf["depth"] < depthMax))
dfGood = depthAndMapqDf[good]
dfBad = depthAndMapqDf[~good]
utils_logger.info("Filtering complete. Elapsed time: {} seconds".format(
time.time() - make_region_lists_start))
return (dfGood, dfBad)
[docs]def make_region_lists_chromosome_filters(
depthAndMapqDf, mapqCutoff, min_depth, max_depth):
"""
Filters a pandas dataframe for mapq and depth based on thresholds calculated
per chromosome
Parameters
----------
depthAndMapqDf : pandas dataframe
Must have 'depth' and 'mapq' columns
mapqCutoff : int
The minimum mapq for a window to be considered high quality
min_depth : float
Fraction of mean to set as minimum depth
max_depth : float
Multiple of mean to set as maximum depth
Returns
-------
tuple
(passing dataframe, failing dataframe)
"""
make_region_lists_start = time.time()
utils_logger.info(
"Calculating filtering thresholds for mapq and depth on a per "
"chromosome basis")
indices = np.unique(depthAndMapqDf.chrom, return_index=True)[1]
ordered_chrom_list = [
depthAndMapqDf.chrom[index] for index in sorted(indices)]
good_list = []
bad_list = []
for i in ordered_chrom_list:
df = depthAndMapqDf.loc[depthAndMapqDf["chrom"] == i]
depth_mean = df["depth"].mean()
depth_sd = df["depth"].std()
depthMin = depth_mean * min_depth
depthMax = depth_mean * max_depth
utils_logger.info(
"Filtering chromosome {} for mapq (MAPQ >= {}) "
"and depth (between {} and {})".format(
i, mapqCutoff, depthMin, depthMax))
good = (
(df["chrom"] == i) &
(df["mapq"] >= mapqCutoff) &
(df["depth"] > depthMin) &
(df["depth"] < depthMax))
good_list.append(df[good])
bad_list.append(df[~good])
dfGood = pd.concat(good_list)
dfBad = pd.concat(bad_list)
utils_logger.info("Filtering complete. Elapsed time: {} seconds".format(
time.time() - make_region_lists_start))
return (dfGood, dfBad)
[docs]def output_bed(outBed, *regionDfs):
"""
Concatenate and merges dataframes into an output bed file
Parameters
----------
outBed : str
The full path to and name of the output bed file
*regionDfs
Variable length list of dataframes to be included
Returns
-------
int
0
"""
dfComb = pd.concat(regionDfs)
regionList = dfComb.ix[:, "chrom":"stop"].values.tolist()
merge = pybedtools.BedTool(regionList).sort().merge()
with open(outBed, 'w') as output:
output.write(str(merge))
return 0
[docs]def output_bed_no_merge(outBed, *regionDfs):
"""
Concatenate dataframes into an output bed file. This will preserve all
columns after the first three as well.
Parameters
----------
outBed : str
The full path to and name of the output bed file
*regionDfs
Variable length list of dataframes to be included
Returns
-------
int
0
"""
dfComb = pd.concat(regionDfs)
regionList = dfComb.ix[:,:].values.tolist()
regionList_str = [[str(y) for y in x] for x in regionList]
sorted = pybedtools.BedTool(regionList_str).sort()
with open(outBed, 'w') as output:
output.write(str(sorted))
return 0
[docs]def chromosome_wide_plot(
chrom, positions, y_value, measure_name, sampleID, output_prefix,
MarkerSize, MarkerAlpha, Xlim, Ylim, x_scale=1000000):
"""
Plots values across a chromosome, where the x axis is the position along the
chromosome and the Y axis is the value of the measure of interest.
Parameters
----------
chrom : str
Name of the chromosome
positions : numpy array
Genomic coordinates
y_value : numpy array
The values of the measure of interest
measure_name : str
The name of the measure of interest (y axis title)
sampleID : str
The name of the sample
output_prefix : str
Full path to and prefix of desired output plot
MarkerSize : float
Size in points^2
MarkerAlpha : float
Transparency (0 to 1)
Xlim : float
Maximum X value
Ylim : float
Maximum Y value
x_scale : int
Divide all x values (including Xlim) by this value. Default is 1000000 (1MB)
Returns
-------
int
0
"""
if "x" in chrom.lower():
Color = "green"
elif "y" in chrom.lower():
Color = "blue"
else:
Color = "red"
fig = plt.figure(figsize=(15, 5))
axes = fig.add_subplot(111)
positions = np.divide(positions, float(x_scale))
axes.scatter(
positions, y_value, c=Color, alpha=MarkerAlpha, s=MarkerSize, lw=0)
axes.set_xlim(0, (Xlim / float(x_scale)))
axes.set_ylim(0, Ylim)
axes.set_title("%s - %s" % (sampleID, chrom))
if x_scale == 1000000:
scale_label = "(MB)"
elif x_scale == 1000:
scale_label = "(KB)"
elif x_scale == 1:
scale_label = "(BP)"
else:
scale_label = "(divided by {})".formatt(x_scale)
axes.set_xlabel("Chromosomal Position {}".format(scale_label))
axes.set_ylabel(measure_name)
plt.savefig("{}_{}_{}_GenomicScatter.pdf".format(
output_prefix, chrom, measure_name), transparent=True)
# plt.savefig("{}_{}_{}_GenomicScatter.svg".format(
# output_prefix, chrom, measure_name))
# plt.savefig("{}_{}_{}_GenomicScatter.png".format(
# output_prefix, chrom, measure_name))
plt.close(fig)
return 0
[docs]def hist_array(chrom, value_array, measure_name, sampleID, output_prefix):
"""
Plots a histogram of an array of values of interest. Intended for mapq and
depth, but generalizeable. Separate function from variants.hist_read_balance
because that function eliminates fixed variants, while this function will
plot all values.
Parameters
----------
chrom : str
Name of the chromosome
value_array : numpy array
Read balance values
measure_name : str
The name of the measure of interest (y axis title)
sampleID : str
Sample name or id to include in the plot title
output_prefix : str
Desired prefix (including full path) of the output files
Returns
-------
int
0 if plotting successful, 1 otherwise.
"""
if len(value_array) < 1:
utils_logger.info(
"No {} values on {} to plot histogram. Skipping.".format(
measure_name, chrom))
return 1
else:
value_array = value_array[~np.isnan(value_array)]
if "x" in chrom.lower():
Color = "green"
elif "y" in chrom.lower():
Color = "blue"
else:
Color = "red"
fig = plt.figure(figsize=(8, 8))
axes = fig.add_subplot(111)
axes.set_title("{} - {}".format(sampleID, chrom))
axes.set_xlabel("{}".format(measure_name))
axes.set_ylabel("Frequency")
axes.hist(value_array, bins=50, color=Color)
plt.savefig("{}_{}_{}_Hist.pdf".format(
output_prefix, chrom, measure_name), transparent=True)
# plt.savefig("{}_{}_{}_Hist.svg".format(output_prefix, chrom, measure_name))
# plt.savefig("{}_{}_{}_Hist.png".format(output_prefix, chrom, measure_name))
plt.close(fig)
utils_logger.info(
"{} histogram of {} complete.".format(measure_name, chrom))
return 0
[docs]def plot_depth_mapq(
window_df, output_prefix, sampleID, chrom_length, MarkerSize,
MarkerAlpha, x_scale=1000000):
"""
Creates histograms and genome-wide plots of various metrics.
Parameters
----------
window_df : pandas dataframe
Columns must include chrom, start, depth, and mapq (at least)
output_prefix : str
Path and prefix of output files to create
sampleID : str
Sample ID
chrom_length: int
Length of chromosome
x_scale : int
Divide all x values (including Xlim) by this value for chromosome_wide_plot.
Default is 1000000 (1MB)
Returns
-------
int
0
"""
chromosome = window_df["chrom"][1]
# Create genome-wide plots based on window means
if window_df is not None:
# depth plot
chromosome_wide_plot(
chromosome, window_df["start"].values, window_df["depth"].values,
"Depth", sampleID, output_prefix,
MarkerSize, MarkerAlpha,
chrom_length, 100, x_scale)
hist_array(
chromosome, window_df["depth"], "Depth", sampleID, output_prefix)
# mapping quality plot
chromosome_wide_plot(
chromosome, window_df["start"].values, window_df["mapq"].values,
"Mapq", sampleID, output_prefix,
MarkerSize, MarkerAlpha, chrom_length, 80, x_scale)
hist_array(
chromosome, window_df["mapq"], "Mapq", sampleID, output_prefix)
return 0
[docs]def before_after_plot(
chrom, positions, values_before, values_after, measure_name, sampleID,
output_prefix, MarkerSize, MarkerAlpha, Xlim,
YMin="auto", YMax="auto", x_scale=1000000, Color="black"):
"""
Plots difference between before/after values (after minus before) across
a chromosome.
Parameters
----------
chrom : str
Name of the chromosome
positions : numpy array
Genomic coordinates
values_before : numpy array
The values of the measure of interest in the "before" condidtion
values_after : numpy array
The values of the measure of interest in the "after" condidtion
measure_name : str
The name of the measure of interest (for y-axis title)
sampleID : str
The name of the sample
output_prefix : str
Full path to and prefix of desired output plot
MarkerSize : float
Size in points^2
MarkerAlpha : float
Transparency (0 to 1)
Xlim : float
Maximum X value
YMin : str, int, or float
If "auto", will allow matplotlib to automatically determine limit. Otherwise,
will set the y axis minimum to the value provided (int or float)
YMax : str, int, or float
If "auto", will allow matplotlib to automatically determine limit. Otherwise,
will set the y axis maximum to the value provided (int or float)
x_scale : int
Divide all x values (including Xlim) by this value. Default is 1000000 (1MB)
Color : str
Color to use for points. See matplotlib documentation for acceptable options
Returns
-------
int
0 if plotting successful, 1 otherwise
"""
# Check that lengths are identical
if len(values_before) != len(values_after):
utils_logger.error(
"Error. values_before and values_after must have the same length")
return 1
else:
value_array = np.nan_to_num(values_after) - np.nan_to_num(values_before)
fig = plt.figure(figsize=(15, 5))
axes = fig.add_subplot(111)
positions = np.divide(positions, float(x_scale))
axes.scatter(
positions, value_array, c=Color, alpha=MarkerAlpha, s=MarkerSize, lw=0)
axes.set_xlim(0, (Xlim / float(x_scale)))
if YMin != "auto":
if "." in YMin:
YMin = float(YMin)
else:
YMin = int(YMin)
if YMax != "auto":
if "." in YMax:
YMax = float(YMax)
else:
YMax = int(YMax)
axes.ylim((YMin, YMax))
else:
axes.ylim(ymin=YMin)
elif YMax != "auto":
if "." in YMax:
YMax = float(YMax)
else:
YMax = int(YMax)
axes.ylim(ymax=YMax)
axes.set_title("%s - %s" % (sampleID, chrom))
if x_scale == 1000000:
scale_label = "(MB)"
elif x_scale == 1000:
scale_label = "(KB)"
elif x_scale == 1:
scale_label = "(BP)"
else:
scale_label = "(divided by {})".formatt(x_scale)
axes.set_xlabel("Chromosomal Position {}".format(scale_label))
axes.set_ylabel("Difference in {}".format(measure_name))
plt.savefig("{}_{}_{}_BeforeAfterScatter.pdf".format(
output_prefix, chrom, measure_name), transparent=True)
# plt.savefig("{}_{}_{}_BeforeAfterScatter.svg".format(
# output_prefix, chrom, measure_name))
# plt.savefig("{}_{}_{}_BeforeAfterScatter.png".format(
# output_prefix, chrom, measure_name))
plt.close(fig)
return 0