Source code for xyalign.ploidy

# Part of XYalign
# Collection of functions related to ploidy estimation and chromosome depth
# comparison

from __future__ import division
from __future__ import print_function
import csv
import logging
import numpy as np
from scipy.stats import ks_2samp
import time


# Create logger for ploidy submodule
ploidy_logger = logging.getLogger("xyalign.ploidy")


[docs]def permutation_test_chromosomes( data_frame, first_chrom, second_chrom, chrom_column, value_column, num_perms, output_file=None): """ Runs a permutation test comparing mean values of two chromosomes. Parameters ---------- data_frame : pandas dataframe first_chrom : str The name of the first chromosome in comparison second_chrom : str The name of the second chromosome in comparison chrom_column : str The name of the column containing chromosome names value_column : str The name of the column containing the value of interest num_perms : int The number of permutations to use output_file : {str, None} If not None, will print results to this file Returns ------- tuple (mean of first chrom, mean of second chrom, p-value) """ perm_start = time.time() ploidy_logger.info( "Running permutation test ({} reps) comparing ratio of {} over {}".format( num_perms, first_chrom, second_chrom)) first_vals = data_frame[ data_frame[chrom_column] == first_chrom][value_column] second_vals = data_frame[ data_frame[chrom_column] == second_chrom][value_column] combined = np.append(first_vals, second_vals) first_mean = np.mean(first_vals) second_mean = np.mean(second_vals) observed = first_mean / second_mean perms = [] for i in range(0, num_perms): np.random.shuffle(combined) first = np.mean(combined[:len(first_vals)]) second = np.mean(combined[-len(second_vals):]) perms.append(first / second) perms = np.asarray(perms) sig = len(np.where(perms > observed)) / num_perms if output_file is not None: a = [ "{}_mean".format(first_chrom), "{}_mean".format(second_chrom), "{}_{}_ratio".format(first_chrom, second_chrom), "p_val_({}_/_{})".format(first_chrom, second_chrom), "perm_dist_2.5", "perm_dist_50", "perm_dist_97.5"] b = [ "{}".format(first_mean), "{}".format(second_mean), "{}".format(observed), "{}".format(sig), "{}".format(np.percentile(perms, 2.5)), "{}".format(np.percentile(perms, 50)), "{}".format(np.percentile(perms, 97.5))] with open(output_file, "w") as f: w = csv.writer(f, dialect="excel-tab") w.writerows([a, b]) ploidy_logger.info( "Permutations on {} and {} complete. Elapsed time: {} seconds".format( first_chrom, second_chrom, time.time() - perm_start)) return (first_mean, second_mean, sig)
[docs]def ks_two_sample( data_frame, first_chrom, second_chrom, chrom_column, value_column, output_file=None): """ Runs a Two-sample Kolmogorov-Smirnov test Parameters ---------- data_frame : pandas dataframe first_chrom : str The name of the first chromosome in comparison second_chrom : str The name of the second chromosome in comparison chrom_column : str The name of the column containing chromosome names value_column : str The name of the column containing the value of interest output_file : {str, None} If not None, will print results to this file. Returns ------- tuple (ks_statistic, ks_pvalue) """ ks_start = time.time() ploidy_logger.info( "Running KS two sample test on {} and {}".format( first_chrom, second_chrom)) first_vals = data_frame[ data_frame[chrom_column] == first_chrom][value_column] second_vals = data_frame[ data_frame[chrom_column] == second_chrom][value_column] first_mean = np.mean(first_vals) second_mean = np.mean(second_vals) mean_ratio = first_mean / second_mean result = ks_2samp(first_vals, second_vals) if output_file is not None: a = [ "{}_mean".format(first_chrom), "{}_mean".format(second_chrom), "{}_{}_ratio".format(first_chrom, second_chrom), "ks_statistic", "p_val"] b = [ "{}".format(first_mean), "{}".format(second_mean), "{}".format(mean_ratio), "{}".format(result[0]), "{}".format(result[1])] with open(output_file, "w") as f: w = csv.writer(f, dialect="excel-tab") w.writerows([a, b]) ploidy_logger.info( "KS two sample test on {} and {} complete. Elapsed time: {} seconds".format( first_chrom, second_chrom, time.time() - ks_start)) return result
[docs]def bootstrap( data_frame, first_chrom, second_chrom, chrom_column, value_column, num_reps, output_file=None): """ Bootstraps the 95 percent confidence interval of the mean ratio of measure for two chromosomes (chrom1 / chrom2). Parameters ---------- data_frame : pandas dataframe first_chrom : str The name of the first chromosome in comparison second_chrom : str The name of the second chromosome in comparison chrom_column : str The name of the column containing chromosome names value_column : str The name of the column containing the value of interest num_reps : int The number of bootstrap replicates to use output_file : {str, None} If not None, will print results to this file. Returns ------- tuple (mean ratio, 0.025 percentile, 0.975 percentile) """ boot_start = time.time() ploidy_logger.info( "Bootstrapping mean depth ratio of {} over {}".format( first_chrom, second_chrom)) first_vals = data_frame[ data_frame[chrom_column] == first_chrom][value_column] second_vals = data_frame[ data_frame[chrom_column] == second_chrom][value_column] first_vals = np.asarray(first_vals) second_vals = np.asarray(second_vals) first_mean = np.mean(first_vals) second_mean = np.mean(second_vals) mean_ratio = first_mean / second_mean samples = [] dim1 = len(first_vals) dim2 = len(second_vals) for i in range(0, num_reps): indices1 = np.random.random_integers(0, dim1 - 1, dim1) indices2 = np.random.random_integers(0, dim2 - 1, dim2) boot1 = np.take(first_vals, indices1) boot2 = np.take(second_vals, indices2) samples.append(np.mean(boot1) / np.mean(boot2)) samples = np.asarray(samples) if output_file is not None: a = [ "{}_mean".format(first_chrom), "{}_mean".format(second_chrom), "{}_{}_ratio".format(first_chrom, second_chrom), "boot_2.5_percentile", "boot_50_percentile", "boot_97.5_percentile"] b = [ "{}".format(first_mean), "{}".format(second_mean), "{}".format(mean_ratio), "{}".format(np.percentile(samples, 2.5)), "{}".format(np.percentile(samples, 50)), "{}".format(np.percentile(samples, 97.5))] with open(output_file, "w") as f: w = csv.writer(f, dialect="excel-tab") w.writerows([a, b]) ploidy_logger.info( "Bootstrapping of {} and {} (ratio) complete. " "Elapsed time: {} seconds".format( first_chrom, second_chrom, time.time() - boot_start)) return (mean_ratio, np.percentile(samples, 2.5), np.percentile(samples, 97.5))