-
Notifications
You must be signed in to change notification settings - Fork 3
/
Proccess.py
114 lines (94 loc) · 3.53 KB
/
Proccess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
"""A set of tools to process files"""
import logging
import multiprocessing
from pathlib import Path
import glob
import random
import signal
import sys
import numpy as np
from FeatureExtract import extract
from Config import _NB_LINES , _NB_FILES_MIN
# Module-level logger, named after this module
LOGGER = logging.getLogger(__name__)
# Seed the pseudo-random generator without a fixed value, so the shuffle
# performed on the file list differs from one run to the next
random.seed()
'''
Collect the files under the source directory whose extension is one of the
extensions the neural network should be trained on:
If enough files are found (at least _NB_FILES_MIN, imported from Config):
shuffle them and return them.
Otherwise: exit after printing "Too few source files".
'''
def search_files(source, extensions):
    """Collect the files under ``source`` whose extension is wanted.

    Only paths matching the glob pattern ``*/*`` relative to ``source``
    (i.e. entries located one directory level below it) are considered.
    The matching files are shuffled before being returned.

    :param source: root directory to search
    :param extensions: container of accepted extensions, without leading dot
    :return: shuffled list of matching file paths
    :rtype: list
    :raises SystemExit: with status 1 when fewer than ``_NB_FILES_MIN``
        files are found
    """
    print(extensions)
    files = [
        path for path in Path(source).glob('*/*')
        if path.is_file() and path.suffix.lstrip('.') in extensions]
    nb_files = len(files)
    print("Total files found: ", nb_files)
    if nb_files < _NB_FILES_MIN:
        print("Too few source files (<", _NB_FILES_MIN, ").")
        # Exit with a non-zero status: a bare sys.exit() reports success (0),
        # which hides the failure from shells and calling scripts.
        sys.exit(1)
    random.shuffle(files)
    return files
'''
Explanation already given in the function's docstring.
'''
def extract_from_files(files, languages):
    """Compute the feature arrays of the given files using a process pool.

    Each language is given a rank equal to its position in the list of
    languages sorted by name; every extension of a language maps to that rank.

    :param list files: list of file paths
    :param dict languages: language name => associated file extension list
    :return: the arrays built by ``_to_arrays`` from the per-file features
    :rtype: tuple
    """
    # Extension => rank of the language it belongs to
    rank_map = {}
    for rank, (_, exts) in enumerate(sorted(languages.items())):
        for ext in exts:
            rank_map[ext] = rank

    with multiprocessing.Pool(initializer=_process_init) as pool:
        # One (path, rank_map) argument tuple per file
        jobs = [(path, rank_map) for path in files]
        features = pool.starmap(_extract_features, jobs)
        arrays = _to_arrays(features)

    print("Extracted arrays count:", len(arrays[0]))
    return arrays
'''
Multiprocessing requires the main process to keep running.
If the main process is aborted or killed, the subprocesses must stop as well
in order to save resources.
'''
def _process_init():
# Stop the subprocess silently when the main process is killed
signal.signal(signal.SIGINT, signal.SIG_IGN)
'''
Look up the extension of the file at the given path in the rank map:
if present : read the file and return the features extracted from its first
_NB_LINES lines, along with the rank.
else : currently this case is ignored (the rank is None). In the future it can be extended as a wider range of extensions is supported.
'''
def _extract_features(path, rank_map):
    """Build the ``[features, rank]`` pair of a single file.

    Only the first ``_NB_LINES`` lines of the file are passed to ``extract``.

    :param path: path of the file (must provide ``suffix``)
    :param dict rank_map: file extension (without leading dot) => rank
    :return: ``[extract(content), rank]``; ``rank`` is ``None`` when the
        file extension is not a key of ``rank_map``
    :rtype: list
    """
    ext = path.suffix.lstrip('.')
    # Unknown extensions are deliberately not rejected: the rank is then None
    rank = rank_map.get(ext)

    content = read_file(path)
    if content is None:
        # read_file returns None when the file is not valid UTF-8; fall back
        # to empty content instead of failing on None.splitlines()
        content = ''
    content = '\n'.join(content.splitlines()[:_NB_LINES])
    return [extract(content), rank]
'''
Return the ranks and content vectors present in features as np.array objects.
'''
def _to_arrays(features):
# Flatten and split the dataset
ranks = []
content_vectors = []
for content_vector, rank in features:
ranks.append(rank)
content_vectors.append(content_vector)
# Convert lists into numpy arrays
return (np.array(content_vectors), np.array(ranks))
'''
On the defined file_path, read the text inside the file if it is in utf-8 format.
'''
def read_file(file_path):
    """Return the text of ``file_path`` decoded as UTF-8.

    :param file_path: path object providing ``read_text``
    :return: the file content, or ``None`` when decoding raises a
        ``UnicodeError``
    """
    try:
        text = file_path.read_text(encoding='utf-8')
    except UnicodeError:
        # Files that are not valid UTF-8 are dropped
        return None
    return text