# coalescent ibd simulator for paper
# slow multiple coalesce, wf process based on
# one with on/off buttons for merge, prune
# these are much slower, hence the name slow.py
# importing relevant packages
import numpy as np
import pandas as pd
import networkx as nx
from scipy.stats import binom
from random import randint, sample
from math import floor, exp, log, ceil
from numpy.random import exponential as Exp
from .coalescent import *
[docs]def empty_function():
return None
def two_randint(n):
x = randint(0,n)
y = randint(0,n)
while x == y:
y = randint(0,n)
return sorted([x,y])
### multiple coalesce, wf process only ###
class WfDirectedPaths:
def __init__(self, _id, _left_length, _right_length):
self._left_length = _left_length
self._right_length = _right_length
self._num_leaves = 1
self._leading_leaf = _id
return None
def add_edge(self, _left_length, _right_length):
self._left_length = min(self._left_length, _left_length)
self._right_length = min(self._right_length, _right_length)
return None
class WfNode:
def __init__(self, _id, _time, _left_length, _right_length):
self._id = _id
self._time = _time
if _time <= 0:
self._paths_list = [WfDirectedPaths(_id, _left_length, _right_length)]
self._num_leaves = 1
else:
self._paths_list = []
self._num_leaves = 0
self._num_tracts = 0
return None
def __repr__(self):
return 'WfNode(_id = ' + str(self._id) + ')'
def __str__(self):
return 'WfNode(_id = ' + str(self._id) + ')'
def add_incoming_node(self, node):
self._num_leaves += node._num_leaves
self._num_tracts += node._num_tracts
return None
def add_incoming_edge(self, scale):
_left_length = Exp(scale=scale, size=1)[0]
_right_length = Exp(scale=scale, size=1)[0]
for paths in self._paths_list:
paths.add_edge(_left_length, _right_length)
return None
def merge(self):
paths_list = self._paths_list
merged = set() # merge set
doskip = set() # skip set
k = len(paths_list)
for i in range(k):
if i not in doskip:
merged.add(i)
for j in range(i+1,k):
if j not in doskip:
paths1 = paths_list[i]
paths2 = paths_list[j]
if paths1._left_length == paths2._left_length:
if paths1._right_length == paths2._right_length:
m1 = paths1._num_leaves
m2 = paths2._num_leaves
m = m1 + m2
paths_list[i]._num_leaves = m
paths_list[i]._leading_leaf = min(paths1._leading_leaf, paths2._leading_leaf)
doskip.add(j)
# merging
self._paths_list = [paths_list[i] for i in merged]
return None
def prune_paths(self, cutoff):
self._paths_list = [paths for paths in self._paths_list if paths._left_length + paths._right_length >= cutoff]
return None
def multiple_coalesce(self, nodes, cutoff1, cutoff2):
# initializing
nodes = [node for node in nodes]
deltas = [self._time - node._time for node in nodes]
scales = [100 / delta for delta in deltas]
# updating node info
for i in range(len(nodes)):
nodes[i].add_incoming_edge(scales[i])
nodes[i].prune_paths(cutoff2)
nodes[i].merge()
# pairwise comparing
for i in range(len(nodes)-1):
for j in range(i+1,len(nodes)):
_num_tracts, node1, node2 = wf_pairwise_compare(nodes[i], nodes[j], cutoff1, cutoff2)
nodes[i] = node1
nodes[j] = node2
self._num_tracts += _num_tracts
# passing node info up
for i in range(len(nodes)):
self.add_incoming_node(nodes[i])
for paths in nodes[i]._paths_list:
self._paths_list.append(paths)
return None
def wf_pairwise_compare(node1, node2, cutoff1, cutoff2):
# initializing
global H
_num_tracts = 0
K = len(node1._paths_list)
L = len(node2._paths_list)
for k in range(K):
i = node1._paths_list[k]
for l in range(L):
j = node2._paths_list[l]
_left_length = min(i._left_length, j._left_length)
_right_length = min(i._right_length, j._right_length)
_length = _right_length + _left_length
# networking
if _length >= cutoff2:
H.add_edge(i._leading_leaf, j._leading_leaf)
if i._leading_leaf <= j._leading_leaf:
node2._paths_list[l]._leading_leaf = node1._paths_list[k]._leading_leaf
else:
node1._paths_list[k]._leading_leaf = node2._paths_list[l]._leading_leaf
# segment counting
if _length >= cutoff1:
m1 = i._num_leaves
m2 = j._num_leaves
density = m1 * m2
_num_tracts += density
return _num_tracts, node1, node2
def simulate_ibd_wf(n, Ne,
ibd_min_span=2.0, ibs_min_span=1.0,
left_length=np.inf, right_length=np.inf
):
'''ibd segments from a wright fisher process (slower but exact)
Parameters
----------
n : int
Haploid sample size
Ne : dict
Effective population sizes
ibd_min_span : float
cM length threshold
ibs_min_span : float
cM length threshold
left_length : float
Distance to the left chromosome end (default numpy.inf)
right_length : float
Distance to the right chromosome end (default numpy.inf)
Returns
-------
tuple
(Number of detected IBD segments, Size of largest IBD cluster)
'''
# initialize graph network
global H
H = nx.Graph()
# initalize
n = int(float(n))
interiors = [WfNode(i, 0, left_length, right_length) for i in range(n)]
itr = n
timer = 1
end = max(Ne.keys())
while timer <= end:
# record coalescent events
size = Ne[timer]
draw = [randint(0, size - 1) for i in range(len(interiors))]
table = {}
for i in range(len(draw)):
try:
table[draw[i]].append(i)
except KeyError:
table[draw[i]] = [i]
events = [val for key, val in table.items() if len(val) >= 2]
# record pairwise ibd
drop = []
for merge in events:
# incoporate incoming node data into next level interior node
itr += 1
new_node = WfNode(itr, timer, left_length, right_length)
nodes = []
for i in merge:
nodes.append(interiors[i])
drop.append(i)
new_node.multiple_coalesce(nodes, ibd_min_span, ibs_min_span)
# multiple coalescent, often 2
interiors.append(new_node)
# dropping old nodes
drop = sorted(drop)
popr = 0
for d in drop:
interiors.pop(d + popr)
popr -= 1
timer += 1
numTracts = sum([node._num_tracts for node in interiors])
maxIbdGroup = max([len(c) for c in nx.connected_components(H)])
return numTracts, maxIbdGroup
def simulate_ibd_isweep_wf(n, s, p0, Ne,
ibd_min_span = 2, ibs_min_span = 1.0,
random = True, model = 'm', tau0 = 0, ploidy = 2,
left_length=np.inf, right_length=np.inf
):
'''ibd segments from a wright fisher process with selection (slower but exact)
Parameters
----------
n : int
Haploid sample size
s : float
Selection coefficient
p0 : float
Variant frequency at generation 0
Ne : dict
Effective population sizes
ibd_min_span : float
cM length threshold
ibs_min_span : float
cM length threshold
random : bool
True for random walk
model : str
'm', 'a', 'd', or 'r'
tau0 : int
Generation when neutrality begins
ploidy : int
Assume diploid
left_length : float
Distance to the left chromosome end (default numpy.inf)
right_length : float
Distance to the right chromosome end (default numpy.inf)
Returns
-------
tuple
(Number of detected IBD segments, Size of largest IBD cluster, Number of detected IBD segments with different alleles)
'''
# local function
def extend_vector(vec, val, mxn):
'''Extend NumPy array
Parameters
----------
vec : array-like
NumPy array
val : float
Impute value
mxn : int
Array size
Returns
-------
array-like
NumPy array
'''
num = mxn - (len(vec) - 1)
ext = np.array([val for i in range(num)])
return np.concatenate((vec, ext))
global H
H = nx.Graph()
# renaming
p = p0
t = tau0
mdl = model
stoc = random
# calculating structured demographies
ps, Ns, xs = walk_variant_backward(s, p, Ne, stoc, mdl, tau0, ploidy)
n = int(float(n))
n = n * ploidy
G = max(Ne.keys())
n1 = floor(n * ps[0])
n0 = n - n1
N1 = [ceil(i) for i in ploidy * Ns * ps]
N1 = {i:N1[i] for i in range(len(N1))}
qs = extend_vector(1 - ps, 1, G)
Ns = np.array(list(Ne.values()))
N0 = [ceil(i) for i in ploidy * Ns * qs]
N0 = {i:N0[i] for i in range(len(N0))}
# population 1
timer = 1
end = max(N1.keys())
itr = n1
interiors1 = [WfNode(i, 0, left_length, right_length) for i in range(n1)]
while timer <= end:
# record coalescent events
size = N1[timer]
draw = [randint(0, size - 1) for i in range(len(interiors1))]
table = {}
for i in range(len(draw)):
try:
table[draw[i]].append(i)
except KeyError:
table[draw[i]] = [i]
events = [val for key, val in table.items() if len(val) >= 2]
# record pairwise ibd
drop = []
for merge in events:
# incoporate incoming node data into next level interior node
itr += 1
new_node = WfNode(itr, timer, left_length, right_length)
nodes = []
for i in merge:
nodes.append(interiors1[i])
drop.append(i)
new_node.multiple_coalesce(nodes, ibd_min_span, ibs_min_span)
# multiple coalescent, often 2
interiors1.append(new_node)
# dropping old nodes
drop = sorted(drop)
popr = 0
for d in drop:
interiors1.pop(d + popr)
popr -= 1
timer += 1
numTracts1 = sum([node._num_tracts for node in interiors1])
# population 0
timer = 1
itr += 1
interiors0 = [WfNode(i, 0, left_length, right_length) for i in range(itr, itr + n0)]
itr += n0
while timer <= end:
# record coalescent events
size = N0[timer]
draw = [randint(0, size - 1) for i in range(len(interiors0))]
table = {}
for i in range(len(draw)):
try:
table[draw[i]].append(i)
except KeyError:
table[draw[i]] = [i]
events = [val for key, val in table.items() if len(val) >= 2]
# record pairwise ibd
drop = []
for merge in events:
# incoporate incoming node data into next level interior node
itr += 1
new_node = WfNode(itr, timer, left_length, right_length)
nodes = []
for i in merge:
nodes.append(interiors0[i])
drop.append(i)
new_node.multiple_coalesce(nodes, ibd_min_span, ibs_min_span)
# multiple coalescent, often 2
interiors0.append(new_node)
# dropping old nodes
drop = sorted(drop)
popr = 0
for d in drop:
interiors0.pop(d + popr)
popr -= 1
timer += 1
numTracts0 = sum([node._num_tracts for node in interiors0])
# pre adaptation
itr += 1
end = max(Ne.keys())
interiors = interiors0 + interiors1
while timer <= end:
# record coalescent events
size = Ne[timer]
draw = [randint(0, size - 1) for i in range(len(interiors))]
table = {}
for i in range(len(draw)):
try:
table[draw[i]].append(i)
except KeyError:
table[draw[i]] = [i]
events = [val for key, val in table.items() if len(val) >= 2]
# record pairwise ibd
drop = []
for merge in events:
# incoporate incoming node data into next level interior node
itr += 1
new_node = WfNode(itr, timer, left_length, right_length)
nodes = []
for i in merge:
nodes.append(interiors[i])
drop.append(i)
new_node.multiple_coalesce(nodes, ibd_min_span, ibs_min_span)
# multiple coalescent, often 2
interiors.append(new_node)
# dropping old nodes
drop = sorted(drop)
popr = 0
for d in drop:
interiors.pop(d + popr)
popr -= 1
timer += 1
numTracts = sum([node._num_tracts for node in interiors])
maxIbdGroup = max([len(c) for c in nx.connected_components(H)])
return numTracts, maxIbdGroup, numTracts1, numTracts0