-
Notifications
You must be signed in to change notification settings - Fork 0
/
prepare_data.py
132 lines (96 loc) · 3.12 KB
/
prepare_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
"""
Prepare windows of distance marices
"""
import numpy as np
import glob
import os
from pandas import DataFrame
from itertools import combinations
# ------ Setup ------
path_blocks = 'data/in/'
path_estim = 'data/estim/'
file_snps = 'data/snps_names.txt'
if not os.path.exists(path_blocks):
# Create a new directory because it does not exist
os.makedirs(path_blocks)
# Setup for blocks of SNPs
s_step = 10**6
s_window = 3*s_step
# Setup for frequencies
pop_region = ['ETHI', 'LEB', 'TUR', 'UZB', 'IND', 'MOR']
pop_type = ['desi', 'kabuli']
movement_type = 'routes'
tol = 0.01
# ------ Get SNPs coordinates ------
snps_names = []
with open(file_snps, 'r') as file:
for line in file: # reading each line
for word in line.split(): # reading each word
snps_names += [word]
snps_pos = []
snps_chr = []
for sn in snps_names:
if (sn[0] == 'C') and (sn[1] == 'a'):
tmp = sn.split('.')
chr = int(tmp[0][2])
pos = int(tmp[1].split('_')[0])
snps_pos += [pos]
snps_chr += [chr]
else:
snps_pos += [0]
snps_chr += [0]
snps_chr = np.array(snps_chr)
snps_pos = np.array(snps_pos)
# ------ Get blocks ------
blocks = []
block_coord = [] #coordinates
for i_chr in range(1,(max(snps_chr)+1)):
if i_chr == 0:
break
idx_chr = (snps_chr == i_chr)
pos_max = max(snps_pos[idx_chr])
pos = snps_pos[idx_chr][0]
while (pos <= pos_max):
tmp = idx_chr & (snps_pos >= pos) & (snps_pos < pos+s_window)
pos_block = np.where(tmp)[0]
pos = pos + s_step
if len(pos_block) < 10:
continue
blocks += [pos_block]
block_coord += [[i_chr, round(pos - s_step/2)]]
with open(f'{path_blocks}block_coord.txt', 'w') as f:
for i_chr, pos in block_coord:
f.write(f'{i_chr}\t{pos}\n')
# ------ Read Frequencies ------
ilrs = []
pop_names = []
for p_region in pop_region:
for p_type in pop_type:
pattern = f'{path_estim}{p_region}_{movement_type}_{p_type}*'
files = glob.glob(pattern)
if len(files) == 0:
continue
pop_names += [f'{p_region}_{p_type[0]}']
freqs = np.array([[float(x) for x in open(f, 'r')] for f in files])
freqs[freqs >= 1 - tol] = 1 - tol
freqs[freqs < tol] = tol
ilr = np.log((1 - freqs) / freqs) # ilr
ilr = ilr.mean(axis=0) # mean over mcmc iterations
ilrs += [ilr]
n_pop = len(pop_names)
ilrs = np.array(ilrs)
# ------ Normalised ilr balances ------
ilrs_mean = np.mean(ilrs, axis=0)
m_fa = 1 / (1 + np.exp(ilrs_mean)) # geometric mean version
m_fa = np.sqrt( m_fa * (1 - m_fa))
ilrs_norm = ilrs * m_fa # multiplication, yes, it's true by formulas
# ------ Calculate distance matrices in windows ------
for ib, b in enumerate(blocks):
freqs_blocks = ilrs_norm[:, b]
mx = np.zeros((n_pop, n_pop))
for i, j in list(combinations(range(n_pop),2)):
mx[i, j] = mx[j, i] = np.linalg.norm(freqs_blocks[i] - freqs_blocks[j]) / np.sqrt(len(b))
mx = DataFrame(mx)
mx.index = pop_names
mx.columns = pop_names
mx.to_csv(f'{path_blocks}wnd_{ib+1}.txt', sep = '\t')