Source code for xyalign.ploidy

# Part of XYalign
# Collection of functions related to ploidy estimation and chromosome depth
# comparison

from __future__ import division
from __future__ import print_function
import csv
import logging
import numpy as np
from scipy.stats import ks_2samp
import time


# Create logger for ploidy submodule
ploidy_logger = logging.getLogger("xyalign.ploidy")


[docs]def permutation_test_chromosomes(
	data_frame, first_chrom, second_chrom, chrom_column,
	value_column, num_perms, output_file=None):
	"""
	Runs a permutation test comparing mean values of two chromosomes.

	Parameters
	----------
	data_frame : pandas dataframe
	first_chrom : str
		The name of the first chromosome in comparison
	second_chrom : str
		The name of the second chromosome in comparison
	chrom_column : str
		The name of the column containing chromosome names
	value_column : str
		The name of the column containing the value of interest
	num_perms : int
		The number of permutations to use
	output_file : {str, None}
		If not None, will print results to this file

	Returns
	-------
	tuple
		(mean of first chrom, mean of second chrom, p-value)

	"""
	perm_start = time.time()
	ploidy_logger.info(
		"Running permutation test ({} reps) comparing ratio of {} over {}".format(
			num_perms, first_chrom, second_chrom))
	first_vals = data_frame[
		data_frame[chrom_column] == first_chrom][value_column]
	second_vals = data_frame[
		data_frame[chrom_column] == second_chrom][value_column]
	combined = np.append(first_vals, second_vals)

	first_mean = np.mean(first_vals)
	second_mean = np.mean(second_vals)

	observed = first_mean / second_mean
	perms = []
	for i in range(0, num_perms):
		np.random.shuffle(combined)
		first = np.mean(combined[:len(first_vals)])
		second = np.mean(combined[-len(second_vals):])
		perms.append(first / second)
	perms = np.asarray(perms)
	sig = len(np.where(perms > observed)) / num_perms
	if output_file is not None:
		a = [
			"{}_mean".format(first_chrom),
			"{}_mean".format(second_chrom),
			"{}_{}_ratio".format(first_chrom, second_chrom),
			"p_val_({}_/_{})".format(first_chrom, second_chrom),
			"perm_dist_2.5",
			"perm_dist_50",
			"perm_dist_97.5"]
		b = [
			"{}".format(first_mean),
			"{}".format(second_mean),
			"{}".format(observed),
			"{}".format(sig),
			"{}".format(np.percentile(perms, 2.5)),
			"{}".format(np.percentile(perms, 50)),
			"{}".format(np.percentile(perms, 97.5))]
		with open(output_file, "w") as f:
			w = csv.writer(f, dialect="excel-tab")
			w.writerows([a, b])
	ploidy_logger.info(
		"Permutations on {} and {} complete. Elapsed time: {} seconds".format(
			first_chrom, second_chrom, time.time() - perm_start))
	return (first_mean, second_mean, sig)


[docs]def ks_two_sample(
	data_frame, first_chrom, second_chrom, chrom_column,
	value_column, output_file=None):
	"""
	Runs a Two-sample Kolmogorov-Smirnov test

	Parameters
	----------

	data_frame : pandas dataframe
	first_chrom : str
		The name of the first chromosome in comparison
	second_chrom : str
		The name of the second chromosome in comparison
	chrom_column : str
		The name of the column containing chromosome names
	value_column : str
		The name of the column containing the value of interest
	output_file : {str, None}
		If not None, will print results to this file.

	Returns
	-------

	tuple
		(ks_statistic, ks_pvalue)

	"""
	ks_start = time.time()
	ploidy_logger.info(
		"Running KS two sample test on {} and {}".format(
			first_chrom, second_chrom))
	first_vals = data_frame[
		data_frame[chrom_column] == first_chrom][value_column]
	second_vals = data_frame[
		data_frame[chrom_column] == second_chrom][value_column]

	first_mean = np.mean(first_vals)
	second_mean = np.mean(second_vals)
	mean_ratio = first_mean / second_mean

	result = ks_2samp(first_vals, second_vals)

	if output_file is not None:
		a = [
			"{}_mean".format(first_chrom),
			"{}_mean".format(second_chrom),
			"{}_{}_ratio".format(first_chrom, second_chrom),
			"ks_statistic",
			"p_val"]
		b = [
			"{}".format(first_mean),
			"{}".format(second_mean),
			"{}".format(mean_ratio),
			"{}".format(result[0]),
			"{}".format(result[1])]

		with open(output_file, "w") as f:
			w = csv.writer(f, dialect="excel-tab")
			w.writerows([a, b])
	ploidy_logger.info(
		"KS two sample test on {} and {} complete. Elapsed time: {} seconds".format(
			first_chrom, second_chrom, time.time() - ks_start))
	return result


[docs]def bootstrap(
	data_frame, first_chrom, second_chrom, chrom_column,
	value_column, num_reps, output_file=None):
	"""
	Bootstraps the 95 percent confidence interval of the mean ratio of
	measure for two chromosomes (chrom1 / chrom2).

	Parameters
	----------

	data_frame : pandas dataframe
	first_chrom : str
		The name of the first chromosome in comparison
	second_chrom : str
		The name of the second chromosome in comparison
	chrom_column : str
		The name of the column containing chromosome names
	value_column : str
		The name of the column containing the value of interest
	num_reps : int
		The number of bootstrap replicates to use
	output_file : {str, None}
		If not None, will print results to this file.

	Returns
	-------
	tuple
		(mean ratio, 0.025 percentile, 0.975 percentile)
	"""
	boot_start = time.time()

	ploidy_logger.info(
		"Bootstrapping mean depth ratio of {} over {}".format(
			first_chrom, second_chrom))
	first_vals = data_frame[
		data_frame[chrom_column] == first_chrom][value_column]
	second_vals = data_frame[
		data_frame[chrom_column] == second_chrom][value_column]

	first_vals = np.asarray(first_vals)
	second_vals = np.asarray(second_vals)

	first_mean = np.mean(first_vals)
	second_mean = np.mean(second_vals)
	mean_ratio = first_mean / second_mean

	samples = []
	dim1 = len(first_vals)
	dim2 = len(second_vals)
	for i in range(0, num_reps):
		indices1 = np.random.random_integers(0, dim1 - 1, dim1)
		indices2 = np.random.random_integers(0, dim2 - 1, dim2)
		boot1 = np.take(first_vals, indices1)
		boot2 = np.take(second_vals, indices2)
		samples.append(np.mean(boot1) / np.mean(boot2))

	samples = np.asarray(samples)

	if output_file is not None:
		a = [
			"{}_mean".format(first_chrom),
			"{}_mean".format(second_chrom),
			"{}_{}_ratio".format(first_chrom, second_chrom),
			"boot_2.5_percentile",
			"boot_50_percentile",
			"boot_97.5_percentile"]
		b = [
			"{}".format(first_mean),
			"{}".format(second_mean),
			"{}".format(mean_ratio),
			"{}".format(np.percentile(samples, 2.5)),
			"{}".format(np.percentile(samples, 50)),
			"{}".format(np.percentile(samples, 97.5))]
		with open(output_file, "w") as f:
			w = csv.writer(f, dialect="excel-tab")
			w.writerows([a, b])
	ploidy_logger.info(
		"Bootstrapping of {} and {} (ratio) complete. "
		"Elapsed time: {} seconds".format(
			first_chrom, second_chrom, time.time() - boot_start))
	return (mean_ratio, np.percentile(samples, 2.5), np.percentile(samples, 97.5))