-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_util.py
145 lines (129 loc) · 4.84 KB
/
data_util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
from collections import Counter
import nltk
from nltk import SnowballStemmer
import pandas as pd
from sklearn import metrics
def get_prfa(dev_y: list, pred_y: list, verbose=False) -> tuple:
    """
    Score a set of predictions against gold labels.

    Delegates to sklearn.metrics for each of the four scores, using the
    library's default arguments.

    Args:
        dev_y: list of gold labels
        pred_y: list of predicted labels, aligned with dev_y
        verbose: when true, print each score on its own line

    Returns:
        tuple of (precision, recall, f1, accuracy)
    """
    prec = metrics.precision_score(dev_y, pred_y)
    rec = metrics.recall_score(dev_y, pred_y)
    f_score = metrics.f1_score(dev_y, pred_y)
    acc = metrics.accuracy_score(dev_y, pred_y)
    scores = (prec, rec, f_score, acc)

    if not verbose:
        return scores

    print(f'Precision: {prec}')
    print(f'Recall: {rec}')
    print(f'F1 Score: {f_score}')
    print(f'Accuracy: {acc}')
    return scores
def probs_to_preds(probabilities: list) -> list[int]:
    """
    Threshold continuous (sigmoid) outputs into discrete binary predictions.

    :param probabilities: Sequence of indexable items; only the first element
        of each item is inspected
    :return: List with 1 where that first element is strictly greater than
        0.5, and 0 otherwise
    """
    predictions = []
    for output in probabilities:
        if output[0] > 0.5:
            predictions.append(1)
        else:
            predictions.append(0)
    return predictions
def generate_tuples_from_df(df: pd.DataFrame) -> tuple[list[list[str]], list[int]]:
    """
    Build tokenized documents and integer labels from a Pandas DataFrame.

    Each row is unpacked as (text, label), so the frame must have exactly two
    columns in that order. Rows whose text is empty or whitespace-only are
    skipped.

    Output format:
    tokenized text: [[word1, word2, ...], [word1, word2, ...], ...]
    labels: [0, 1, 0, 1, ...]

    :param df: The Pandas DataFrame
    :return: A list of lists of tokens, list of integer labels
    """
    tokenized_docs = []
    labels = []
    for row in df.itertuples(index=False):
        text, label = row
        if not text.strip():
            # Nothing to tokenize in a blank document.
            continue
        tokenized_docs.append(nltk.word_tokenize(text))
        labels.append(int(label))
    return tokenized_docs, labels
def generate_tuples_from_file(training_file_path: str) -> tuple[list[list[str]], list[int]]:
    """
    Load tokenized documents and integer labels from a training file.

    Two input formats are supported, selected by the file extension:

    * Paths ending in ".csv" are read with pandas and handed to
      generate_tuples_from_df, so each row must be exactly (text, label).
    * Any other path is read as UTF-8 text with one example per line, made of
      three tab-separated fields. The second field is the text and the third
      is the label, which must be "0" or "1". The first field is ignored.
      Blank lines and lines without exactly three fields are skipped silently;
      lines with any other label print "WARNING" and are skipped.

    Output format:
    tokenized text: [[word1, word2, ...], [word1, word2, ...], ...]
    labels: [0, 1, 0, 1, ...]

    Parameters:
    training_file_path - Path to the training file

    Return:
    A list of lists of tokens and a list of int labels
    """
    if training_file_path.endswith(".csv"):
        # Same row handling as the DataFrame loader; keep it in one place.
        return generate_tuples_from_df(pd.read_csv(training_file_path))

    X = []
    y = []
    # The context manager closes the file even if tokenization raises.
    with open(training_file_path, "r", encoding="utf8") as training_file:
        for sentence in training_file:
            stripped = sentence.strip()
            if not stripped:
                continue
            fields = stripped.split("\t")
            if len(fields) != 3:
                continue
            _, text, label = fields
            if label not in ("0", "1"):
                print("WARNING")
                continue
            X.append(nltk.word_tokenize(text))
            y.append(int(label))
    return X, y
def create_vocabulary(training_data_X: list) -> list:
    """
    Collect the unique stemmed words that occur in the training data.

    Every token is passed through NLTK's English Snowball stemmer before it is
    added, so the vocabulary holds stems rather than raw surface forms.

    Args:
        training_data_X: the training documents in the format [[word1, word2, ...], ...]

    Returns:
        vocab: a list of the unique stems; built from a set, so the order is
        not meaningful
    """
    stem = SnowballStemmer("english").stem
    unique_stems = {stem(token) for doc in training_data_X for token in doc}
    return list(unique_stems)
def featurize(vocab: list, data_to_be_featurized_X: list, binary: bool = False, verbose: bool = False) -> list:
    """
    Create bag-of-words representations of the given documents.

    Each document becomes a dict with one entry per vocabulary word, in
    vocabulary order. Tokens that are not in the vocabulary are ignored.

    NOTE(review): tokens are counted exactly as given, with no stemming or
    case folding. If vocab holds stemmed words, the documents presumably need
    the same stemming applied first for their tokens to match - confirm
    against the callers.

    Args:
        vocab: list of words in vocabulary
        data_to_be_featurized_X: the documents to featurize in the format [[word1, word2, ...], ...]
        binary: when true, store 1/0 for presence/absence instead of counts
        verbose: when true, print every (word, value) pair as it is stored

    Returns:
        a list with one dict per document in the format
        [{word1: value1, word2: value2, ...}, ...]
    """
    featurized = []
    for tokens in data_to_be_featurized_X:
        counts = Counter(tokens)
        row = {}
        for term in vocab:
            # A Counter reports 0 for missing keys without inserting them.
            row[term] = int(term in counts) if binary else counts[term]
            if verbose:
                print(f'Add {term} with {row[term]}')
        featurized.append(row)
    return featurized