-
Notifications
You must be signed in to change notification settings - Fork 0
/
cnn_vary.py
150 lines (120 loc) · 5.84 KB
/
cnn_vary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from cnn import ConvolutionalNet, vecToMatEncoding
# Nucleotide BDM, GC147 and NuPoP (Affinity) are identified as important
# computed nucleosome organization-related scores. Thus, in this Python script,
# we vary the values of these scores, and see how such varied values
# affect XGBoost's predictions.
# This Python script uses the following data and model files:
# 1. An input CSV file containing 2000 random experimental/augmented datapoints
# from the crisprSQL dataset in the paper
csv_loc = 'data/crisprSQL_dataset_2000.csv'
# 2. PyTorch model file containing the convolutional neural network (CNN) model
# which predicts CRISPR-Cas9 (off-)target cleavage activity.
state_dict_loc = 'models/cnn_model.pt'
# The plots generated will be saved in these relative locations
out_dict = {'Nucleotide BDM': 'out/cnn_vary_nucleotide_bdm.pdf',
'GC147': 'out/cnn_vary_gc147.pdf',
'NuPoP (Affinity)': 'out/cnn_vary_nupop_affinity.pdf'}
### IMPORTANT CONSTANTS ###
# Don't change these!
TAR_SEQ_LEN = 23 # target DNA sequence's length
epiDim = 22 # The total number of epigenetic features considered
numBpWise = 13 # Number of base pair-resolved nucleosome organization-related
# scores/features
# Feel free to change
NUM_SAMPLES = 5 # Number of random datapoints to select for visualization,
# i.e., number of row to select from the input CSV file
NUM_VARIED = 20 # Number of varied points per random datapoint
###########################
# Read the input CSV file
X_df = pd.read_csv(csv_loc)
# Decide which compute device to use when using PyTorch.
# Use a GPU if available, CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Create convolutional neural network regression PyTorch model
torch_model = ConvolutionalNet(epiDim, device=device)
# Load saved PyTorch weights
state_dict = torch.load(state_dict_loc, map_location=device)
torch_model.load_state_dict(state_dict)
# These are the column/feature names corresponding to ...
# (a) Nucleotide BDM
NBDM_feats = ['NucleotideBDM_' + str(i) for i in range(1, 1 + TAR_SEQ_LEN)]
# (b) GC147
GC_feats = ['GCContent_' + str(i) for i in range(1, 1 + TAR_SEQ_LEN)]
# (c) NuPoP (Affinity)
NuPoP_Aff_feats = ['NuPoP_Affinity_147_h'] + \
['NuPoP_Affinity_147_h.' + str(i) for i in range(1,
TAR_SEQ_LEN)]
# Put these features into a dictionary
feats_dict = {'Nucleotide BDM': NBDM_feats,
'GC147': GC_feats,
'NuPoP (Affinity)': NuPoP_Aff_feats}
# To obtain sensible value ranges, we exclude data points with zeros across
# all computed nucleosome organization-related score, since these zeros arise
# from data imputation during training.
non_zero_X_df = X_df[X_df['GCContent_1'] > 0]
# Set values ranges for each computed score to be within the minimum and
# maximum value of the respective features, i.e., we vary within the ranges
# of the dataset.
ranges_dict = {'Nucleotide BDM': (non_zero_X_df[NBDM_feats].min(axis=1).min(),
non_zero_X_df[NBDM_feats].max(axis=1).max()),
'GC147': (non_zero_X_df[GC_feats].min(axis=1).min(),
non_zero_X_df[GC_feats].max(axis=1).max()),
'NuPoP (Affinity)': (non_zero_X_df[NuPoP_Aff_feats].min(axis=1).min(),
non_zero_X_df[NuPoP_Aff_feats].max(axis=1).max())}
# We select five random data points...
X_sample = X_df.sample(NUM_SAMPLES)
# Generate plots for each of the three computed scores
for feat_name, feats in feats_dict.items():
min_val, max_val = ranges_dict[feat_name]
print("{}'s value ranges from {:.2f} to {:.2f}".format(feat_name,
min_val, max_val))
# For each data point...
for i in range(NUM_SAMPLES):
point_chosen = X_sample.iloc[i]
# Create a Pandas dataframe for the random datapoint
example_df = pd.DataFrame(point_chosen).transpose()
# Determine the varied values to change to
varied_vals = np.linspace(min_val, max_val, NUM_VARIED)
# Create the artificial datapoints containing the varied values
vary_df = example_df.loc[example_df.index.repeat(len(varied_vals))]
for feat in feats:
vary_df[feat] = varied_vals
# # Make predictions for these artificial datapoints
# preds = xgb_model.predict(vary_df)
# Convert the Pandas DataFrame into a PyTorch tensor
X_torch = torch.as_tensor(vary_df.to_numpy()).float()
# Format the PyTorch tensor to fit the CNN's input dimensions
x = vecToMatEncoding(X_torch, numBpWise=13)
# Prepare for model inference by setting torch_model to evaluation mode
torch_model.eval()
# Make CRISPR-Cas9 cleavage activity predictions for these artificial datapoints
preds = torch_model(x).cpu().flatten().detach().numpy()
# Plot the line
plt.plot(varied_vals, preds)
# Set...
# the plot's minimum and maximum x-value, ...
plt.xlim(min_val, max_val)
# the x-axis's name, and
plt.xlabel("Varied {} Value".format(feat_name))
# the y-axis's name
plt.ylabel("Predicted CRISPR-Cas9 Cleavage Activity")
# This is not a Jupyter notebook, so we won't show the figure here
# plt.show()
# Save the plot
plt.savefig(out_dict[feat_name])
# Close the current figure
plt.close()
# The script prints:
# Nucleotide BDM's value ranges from 172.66 to 480.48
# GC147's value ranges from 0.25 to 0.86
# NuPoP (Affinity)'s value ranges from 0.50 to 2.13
# We observe the following tendencies in the plots:
# - increasing Nucleotide BDM increases predicted CRISPR-Cas9 cleavage
# activity value
# - increasing GC147 increases predicted CRISPR-Cas9 cleavage activity value
# - increasing NuPoP (Affinity) decreases predicted CRISPR-Cas9 cleavage
# activity value