# test_label.py — forked from rgoubin/participation_author_profiling
# (GitHub page chrome and line-number gutter removed from the scraped copy)
from os.path import join, exists
from os import makedirs
from pickle import load
from time import time
from numpy import array
from shutil import rmtree
import operator
import re
import pickle
import numpy as np
from giovanniScripts.dataset_parser import parse_tweets_from_dir
from utils import abort_clean
from giovanniScripts.classifiers import get_classifier
from giovanniScripts.features import get_features_extr
from giovanniScripts.persistance import save_scores, save_model, save_author_file
from giovanniScripts.pipeline import get_pipeline
from giovanniScripts.utils import build_corpus, abort_clean, print_scores, format_dir_name
from giovanniScripts.utils import get_classifier_name, get_features_extr_name, get_labels
from giovanniScripts import clean_ar, clean_en, clean_es
import giovanniScripts.generic_features_text as generic
import giovanniScripts.specific_features_en as specific_en
import giovanniScripts.specific_features_es as specific_es
from sklearn.base import clone
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import KFold
from giovanniScripts import clean_en_txt_word2vec, clean_es_txt # Alaa : for tweet2vec
from tweet2vec import load_vectors, tweet2vec
# Output directories where each persisted classifier variant lives.
options = {
    'text_clf_path_tfidf': './output_txt_train/tfidf',
    'text_clf_path_label': './output_txt_train/label',
    'text_clf_path_meta': './output_txt_train/meta',
    'text_clf_path_bot': './output_txt_train/bot',
    'text_clf_path_user2vec': './output_txt_train/user2vec',
}
def parse_gender_dict(truthFilePath):
    """Parse a PAN truth file into an author-id -> gender-label dict.

    Each non-empty line has the form ``author_id:::gender``.
    Female is labeled 0; any other value (i.e. male) is labeled 1.

    :param truthFilePath: path to the ``:::``-separated truth file
    :return: dict mapping author id to 0 (female) or 1 (male)
    """
    genders = dict()
    with open(truthFilePath) as f:
        for line in f:
            line = line.strip()
            if not line:
                # Robustness: a blank line would otherwise raise IndexError.
                continue
            infos = line.split(':::')
            genders[infos[0]] = 0 if infos[1] == 'female' else 1
    return genders
def parse_gender_dict_2019(truthFilePath):
    """Parse a PAN 2019 truth file into an author-id -> label dict.

    Each non-empty line has the form ``author_id:::type:::gender``.
    Bot accounts are labeled 2; for humans, female is 0 and male is 1.

    :param truthFilePath: path to the ``:::``-separated truth file
    :return: dict mapping author id to 0 (female), 1 (male) or 2 (bot)
    """
    genders = dict()
    with open(truthFilePath) as f:
        for line in f:
            line = line.strip()
            if not line:
                # Robustness: a blank line would otherwise raise IndexError.
                continue
            infos = line.split(':::')
            if infos[1] == 'bot':
                genders[infos[0]] = 2
            else:
                genders[infos[0]] = 0 if infos[2] == 'female' else 1
    return genders
def predict(input_path, output_path, lang="es", verbosity_level=1):
    """Predict label probabilities for the authors found under ``input_path``.

    Parses tweets for ``lang``, loads the pickled label classifier, builds
    generic + language-specific feature vectors, and runs ``predict_proba``
    per author (capped at the first 100 authors).

    :param input_path: root dataset directory containing a ``<lang>/`` subdir
    :param output_path: currently unused — NOTE(review): presumably intended
        for saving predictions as xml files; confirm against caller.
    :param lang: language code ('en' or 'es' have specific features)
    :param verbosity_level: passed through to the tweet parser
    :return: list of per-author ``predict_proba`` results
    """
    Authors_full = parse_tweets_from_dir(
        input_dir=format_dir_name(input_path + "/" + lang + "/"),
        aggregation=1,
        label=False,
        verbosity_level=verbosity_level,
        remove_URL_and_mention=True)

    # Keep the original 100-author cap, but don't crash when the dataset
    # has fewer than 100 authors (the old while-loop indexed out of range).
    Authors = Authors_full[:100]

    with open(options['text_clf_path_label'] + '/' + lang + '/label-classifier.p', "rb") as input_file:
        clf_label = pickle.load(input_file)

    generic_features_test = generic.all_generic_features(Authors)
    specific_features_test = []
    if lang == 'en':
        specific_features_test = specific_en.get_all_specific_features(
            Authors)
    if lang == 'es':
        specific_features_test = specific_es.get_all_specific_features_label(
            Authors)

    # BUG FIX: the original loop set i = 0 and never incremented it, so
    # every author was scored with author 0's feature vector.
    predictions = []
    for i, author in enumerate(Authors):
        print(i)
        features = generic_features_test[i] + specific_features_test[i]
        print(len(features))
        predictions.append(clf_label.predict_proba([features]))
    return predictions
if __name__ == "__main__":
    import argparse

    # Command-line entry point: run prediction over one language subset.
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", help="Path to the whole dataset")
    parser.add_argument(
        "-o", help="Path to save the result of the prediction as xml files")
    # BUG FIX: the -l help text was copy-pasted from -o; this flag selects
    # the dataset language (it is passed to predict() as lang).
    parser.add_argument(
        "-l", help="Language of the dataset to predict (e.g. 'en', 'es')")
    args = parser.parse_args()
    predict(input_path=args.i,
            output_path=args.o, lang=args.l, verbosity_level=0)