-
Notifications
You must be signed in to change notification settings - Fork 1
/
extraction.py
172 lines (136 loc) · 5.01 KB
/
extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#!/usr/bin/env python
"""
==========================
News option extraction
==========================
Usage
-----
$ python extraction.py ../../data/ltp_data_v3.4.0 \
../../data/corpus/sqlResult_1558435.csv \
../../data/corpus/news-model.model \
../../data/news/options.csv
"""
import os
import sys
import logging
import argparse
import pandas as pd
from pyltp import Parser
from pyltp import Segmentor
from pyltp import Postagger
from pyltp import NamedEntityRecognizer
from gensim.models.word2vec import Word2Vec
# Every record is rendered as "[LEVEL]: message".
FORMAT = '[%(levelname)s]: %(message)s'

# Module-level logger; it lets everything from DEBUG upwards through.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Configure the root logger with the format above.
logging.basicConfig(format=FORMAT)
class NewsOptionExtractor:
    """Extract "who said what" records from a news corpus.

    Each article is segmented, POS-tagged and NER-tagged with LTP.  Every
    token that is '说' or one of the words the word2vec model ranks as most
    similar to it is then paired with the closest preceding token tagged as
    a name ('Nh') and with the text that follows it up to the next sentence
    terminator.  The resulting records are written to a CSV file.
    """

    # Tokens that close a statement: the CJK full stop plus the ASCII and
    # full-width exclamation marks.
    _SENTENCE_ENDS = ("。", "!", "\uff01")

    def __init__(self, ltp_path, news_path, news_model, outpath):
        """Load the LTP models, the news corpus and the word2vec model.

        Args:
            ltp_path: Directory containing ``cws.model``, ``pos.model``,
                ``ner.model`` and ``parser.model``.
            news_path: CSV file (read as gb18030) with a ``content`` column.
            news_model: Path of a saved gensim ``Word2Vec`` model.
            outpath: Destination CSV path used by :meth:`save_result`.
        """
        logger.info('Initializing...')
        self.outpath = outpath

        # LTP models: word segmentation, POS tagging, NER, dependency parsing.
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(ltp_path, 'cws.model'))
        self.postagger = Postagger()
        self.postagger.load(os.path.join(ltp_path, 'pos.model'))
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(ltp_path, 'ner.model'))
        # NOTE: the dependency parser is loaded but process() does not use it.
        self.parser = Parser()
        self.parser.load(os.path.join(ltp_path, 'parser.model'))

        # News articles; rows without content are dropped.
        df = pd.read_csv(news_path, encoding='gb18030')
        self.contents = df['content'].dropna()

        # Word2vec model used to find words similar to '说'.
        self.news_model = Word2Vec.load(news_model)

    def get_similar_words(self, word):
        """Return the words most similar to *word*, including *word* itself.

        Returns:
            dict: The pairs returned by ``most_similar`` as a mapping, plus
            ``word`` mapped to ``1``.
        """
        words = dict(self.news_model.wv.most_similar(word))
        words[word] = 1
        return words

    @staticmethod
    def get_all_words_related_to(words, arcs, index):
        """Return every word whose dependency arc has ``head == index``.

        Declared static so that it can be called on the class or on an
        instance; ``words`` and ``arcs`` are parallel sequences.
        """
        return [words[i] for i, arc in enumerate(arcs) if arc.head == index]

    def is_sentence_end(self, w):
        """Return True when token *w* is a sentence terminator."""
        return w in self._SENTENCE_ENDS

    def print_option(self, option):
        """Log one extracted record (name, say word, sentence) at DEBUG level."""
        logger.debug('{} {} {}'.format(option['name'], option['say'], option['sentence']))

    def save_result(self, options):
        """Write the list of record dicts to ``self.outpath`` as CSV."""
        pd.DataFrame(options).to_csv(self.outpath)

    def _find_speaker(self, tokens, pos, names):
        """Return the nearest name at or before *pos*, or None if there is none."""
        # Walk backwards all the way to index 0 so that a name opening the
        # article is found as well.
        for token in reversed(tokens[:pos + 1]):
            if token in names:
                return token
        return None

    def _find_statement(self, tokens, pos):
        """Return the text after *pos* up to and including the next terminator.

        Returns None when no sentence terminator follows *pos*.
        """
        collected = []
        for token in tokens[pos + 1:]:
            collected.append(token)
            if self.is_sentence_end(token):
                return ''.join(collected)
        return None

    def _extract_options(self, words, netags, say_words):
        """Build the records for one segmented article.

        Args:
            words: Tokens of the article.
            netags: NER tags parallel to *words*.
            say_words: Container of tokens treated as speech verbs.

        Returns:
            list: Dicts with the keys ``say``, ``name`` and ``sentence``.
            Occurrences without a preceding name or without a following
            sentence terminator are skipped.
        """
        tokens = list(words)
        # All tokens the recognizer tagged as names ('Nh').
        names = {token for token, tag in zip(tokens, netags) if 'Nh' in tag}

        options = []
        for pos, token in enumerate(tokens):
            if token not in say_words:
                continue
            speaker = self._find_speaker(tokens, pos, names)
            if speaker is None:
                continue
            sentence = self._find_statement(tokens, pos)
            if sentence is None:
                continue
            options.append({'say': token, 'name': speaker, 'sentence': sentence})
        return options

    def process(self):
        """Run the extraction over every article and save the result."""
        logger.info('Start to process contents...')
        # The speech-verb vocabulary is the same for every article, so the
        # word2vec model is queried once instead of once per article.
        say_words = self.get_similar_words('说')

        options = []
        total = len(self.contents)
        for count, content in enumerate(self.contents, start=1):
            print('\rProccessing news {}/{}...'.format(count, total), end='', flush=True)
            # 1. Word segmentation
            words = self.segmentor.segment(content.replace('\r\n', ''))
            # 2. POS tagging
            postags = self.postagger.postag(words)
            # 3. Named-entity recognition
            netags = self.recognizer.recognize(words, postags)
            # 4. Pair speech verbs with speakers and statements
            options.extend(self._extract_options(words, netags, say_words))

        self.save_result(options)
def main(argv=None):
    """Parse the command line and run the extraction.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when None.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('ltppath', type=str, help='LTP Model path')
    parser.add_argument('newspath', type=str, help='News path')
    parser.add_argument('newsmodel', type=str, help='News word2vec model path')
    parser.add_argument('outpath', type=str, help='Options output csv path')
    parser.add_argument('--verbose', action='store_true', help='Verbose mode')
    args = parser.parse_args(sys.argv[1:] if argv is None else argv)

    # Honour --verbose: DEBUG records are emitted only when it is given.
    logger.setLevel(logging.DEBUG if args.verbose else logging.INFO)

    extractor = NewsOptionExtractor(
        args.ltppath, args.newspath, args.newsmodel, args.outpath
    )
    extractor.process()


if __name__ == '__main__':
    main()