forked from rgoubin/participation_author_profiling
-
Notifications
You must be signed in to change notification settings - Fork 0
/
predict.py
247 lines (196 loc) · 8.71 KB
/
predict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
from os.path import join, exists
from os import makedirs
from pickle import load
from time import time
from numpy import array
from shutil import rmtree
import operator
import re
import pickle
import numpy as np
from giovanniScripts.dataset_parser import parse_tweets_from_dir
from utils import abort_clean
from giovanniScripts.classifiers import get_classifier
from giovanniScripts.features import get_features_extr
from giovanniScripts.persistance import save_scores, save_model, save_author_file
from giovanniScripts.pipeline import get_pipeline
from giovanniScripts.utils import build_corpus, abort_clean, print_scores, format_dir_name
from giovanniScripts.utils import get_classifier_name, get_features_extr_name, get_labels
from giovanniScripts import clean_ar, clean_en, clean_es
import giovanniScripts.generic_features_text as generic
import giovanniScripts.specific_features_en as specific_en
import giovanniScripts.specific_features_es as specific_es
from sklearn.base import clone
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import KFold
from giovanniScripts import clean_en_txt_word2vec, clean_es_txt # Alaa : for tweet2vec
from tweet2vec import load_vectors, tweet2vec
# Directories holding the pre-trained pickled models, one model family per
# entry; each directory contains one sub-directory per language code
# (see the open(...) calls in predict()).
# NOTE(review): 'text_clf_path_user2vec' is declared but never read in this
# file — presumably used by a sibling script; confirm before removing.
options = {
    'text_clf_path_tfidf': "./output_txt_train/tfidf",
    'text_clf_path_label': './output_txt_train/label',
    'text_clf_path_meta': './output_txt_train/meta',
    'text_clf_path_bot': './output_txt_train/bot',
    'text_clf_path_user2vec': './output_txt_train/user2vec'
}
def parse_gender_dict(truthFilePath):
    """Parse a ':::'-separated truth file into a dict of author labels.

    Each non-empty line is expected to look like ``id:::type[:::gender]``
    where ``type`` is ``bot`` or ``human``.  Labels returned:
    0 = female, 1 = male, 2 = bot.

    :param truthFilePath: path to the truth file
    :return: dict mapping author id -> integer label
    """
    genders = dict()
    with open(truthFilePath) as f:
        # Stream line by line instead of readlines(); skip blank lines so a
        # trailing newline in the truth file no longer raises IndexError.
        for line in f:
            line = line.strip()
            if not line:
                continue
            infos = line.split(':::')
            if infos[1] == 'bot':
                label = 2
            elif infos[2] == 'female':
                label = 0
            else:
                label = 1
            genders[infos[0]] = label
    return genders
def save_xmls(output_path, lang, predictions_dict):
    """Write one XML file per author with its predicted type and gender.

    :param output_path: directory the XML files are written to
                        (created if it does not exist)
    :param lang: language code stored in each author record
    :param predictions_dict: dict mapping author id ->
                             'bot' | 'female' | 'male'
    """
    # exist_ok avoids the exists()/makedirs() race of the original check.
    makedirs(output_path, exist_ok=True)
    for author_id, prediction in predictions_dict.items():
        author = {
            'id': author_id,
            'lang': lang,
            # A 'bot' prediction keeps type 'bot'; anything else is a human
            # whose gender is the prediction itself.  In both cases the
            # 'gender' field carries the raw prediction string.
            'type': 'bot' if prediction == 'bot' else 'human',
            'gender': prediction,
        }
        save_author_file(
            author=author, output_dir=output_path + '/', verbose=0)
def predict(input_path, output_path, verbosity_level=1):
    '''
    Predict the type (bot/human) and, for humans, the gender of every
    author in the dataset, then save one XML result file per author.

    For each language, proceeds as follows:
    - parses the authors' tweets from input_path/<lang>/
    - loads the pickled bot classifier and splits authors into bots/humans
    - for the humans, combines a tfidf-based text prediction with a
      feature-based label classifier, then feeds both probability pairs
      into a meta classifier that decides female vs male
    - writes the per-author XML files under output_path/<lang>/

    :param input_path: Path to the PAN19 dataset (one sub-dir per language)
    :param output_path: Path to dir in which the XML outputs will be saved
    :param verbosity_level: verbosity forwarded to the tweet parser
    NB. The pre-trained models are read from the paths in `options`.
    '''
    #for lang in ['es', 'en']:
    for lang in ['en']:
        input_dir = join(input_path, lang)
        output_dir = join(output_path, lang)
        # NOTE(review): input_dir/output_dir are never used below — the
        # paths are rebuilt inline with string concatenation instead.
        from sklearn.svm import LinearSVC
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.model_selection import cross_val_score
        from sklearn.calibration import CalibratedClassifierCV
        # NOTE(review): the four sklearn imports above are never referenced
        # in this function; presumably kept so pickle can resolve classifier
        # classes when loading the models — confirm before removing.
        print("Extracting Authors:")
        Authors = parse_tweets_from_dir(
            input_dir=format_dir_name(input_path + "/" + lang + "/"),
            aggregation=1,
            label=False,
            verbosity_level=verbosity_level,
            remove_URL_and_mention=True)
        predictions_dict = dict()
        Bots = []
        Humans = []
        # TO DELETE
        #Humans = Authors
        # Every author is first run through the bot classifier.
        Bots = Authors
        # -----------------------------------------------------
        # ------ DETERMINING IF USERS ARE BOTS OR HUMANS ------
        # -----------------------------------------------------
        print('Get bot classifier')
        clf_bot = None
        with open(options['text_clf_path_bot'] + '/' + lang + '/bot-classifier.p', "rb") as input_file:
            clf_bot = pickle.load(input_file)
        print('--------------- feature extractor ------------------')
        # One generic feature vector per author, index-aligned with Bots.
        bot_features_test = generic.all_generic_bot_features(Bots, lang)
        print('--------------- feature extracted ------------------')
        bot_predictions_label = dict()
        # NOTE(review): bot_predictions_label is never filled nor read.
        print('--------------- construction of features for the bot classifier ------------------')
        i = 0
        for author in Bots:
            print(i)
            print(bot_features_test[i])
            prediction_author = clf_bot.predict_proba([bot_features_test[i]])
            # Column 1 is assumed to be the 'human' class probability —
            # TODO confirm against the bot classifier's label encoding.
            if prediction_author[0][1] >= 0.5:
                # predictions_dict[author['id']] = 'human'
                Humans.append(author)
            else:
                predictions_dict[author['id']] = 'bot'
            i = i + 1
        print("--------------- bot prediction done ---------------")
        #save_xmls(output_path + '/' + lang, lang, predictions_dict)
        # -----------------------------------------------------
        # --- DETERMINING IF THE HUMANS ARE FEMALE OR MALE ----
        # -----------------------------------------------------
        print('Get classifiers label, meta and user2vec')
        clf_label = None
        clf_meta = None
        clf_user2vec = None
        word2vec_model = None
        # NOTE(review): clf_user2vec / word2vec_model are never assigned a
        # real value here (the user2vec branch below is commented out).
        import text_prediction
        # tfidf-based gender probabilities, keyed by author id.
        predictions_test_tfidf = text_prediction.predict(
            input_path, options['text_clf_path_tfidf'], languages=[lang])
        print('--------------- feature extractor ------------------')
        with open(options['text_clf_path_label'] + '/' + lang + '/label-classifier.p', "rb") as input_file:
            clf_label = pickle.load(input_file)
        with open(options['text_clf_path_meta'] + '/' + lang + '/meta-classifier.p', "rb") as input_file:
            clf_meta = pickle.load(input_file)
        # Feature vectors index-aligned with Humans.
        generic_features_test = generic.all_generic_features(Humans)
        specific_features_test = []
        if lang == 'en':
            specific_features_test = specific_en.get_all_specific_features(
                Humans)
        if lang == 'es':
            specific_features_test = specific_es.get_all_specific_features_label(
                Humans)
        print('--------------- feature extracted ------------------')
        text_predictions_label = dict()
        print('--------------- construction of features for the meta classifier ------------------')
        X_test = dict()
        i = 0
        for author in Humans:
            print(i)
            # Generic + language-specific features feed the label classifier.
            features = generic_features_test[i] + specific_features_test[i]
            prediction_author = clf_label.predict_proba([features])
            text_predictions_label[author['id']] = prediction_author[0]
            i = i + 1
            # Meta-classifier input: the two tfidf probabilities followed by
            # the two label-classifier probabilities for this author.
            toAppend = []
            toAppend.append(predictions_test_tfidf[author['id']][0])
            toAppend.append(predictions_test_tfidf[author['id']][1])
            toAppend.append(prediction_author[0][0])
            toAppend.append(prediction_author[0][1])
            '''if lang == 'en':
            toAppend.append(prediction_user2vec[author['id']][0])
            toAppend.append(prediction_user2vec[author['id']][1])'''
            X_test[author['id']] = toAppend
        # Cast every feature to float before feeding the meta classifier.
        X_test_casted = dict()
        for author in Humans:
            x = X_test[author['id']]
            toAppend = []
            for feature in x:
                toAppend.append(float(feature))
            X_test_casted[author['id']] = toAppend
        print('--------------- meta classifier predictions ------------------')
        i = 0
        for author in Humans:
            prediction_author = clf_meta.predict_proba(
                [X_test_casted[author['id']]])
            # Column 0 is assumed to be the 'female' class probability —
            # TODO confirm against the meta classifier's label encoding.
            if prediction_author[0][0] >= 0.5:
                predictions_dict[author['id']] = 'female'
            else:
                predictions_dict[author['id']] = 'male'
            i = i + 1
        print('--------------- meta classifier predictions done ------------------')
        print('--------------- saving ------------------')
        save_xmls(output_path + '/' + lang, lang, predictions_dict)
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Predict bot/human and gender for PAN19 authors and "
                    "save the results as per-author XML files.")
    # required=True: omitting either flag previously passed None into
    # predict() and failed deep inside path handling.
    parser.add_argument("-i", required=True,
                        help="Path to the whole dataset")
    parser.add_argument("-o", required=True,
                        help="Path to save the result of the prediction as xml files")
    args = parser.parse_args()
    predict(input_path=args.i,
            output_path=args.o, verbosity_level=0)