-
Notifications
You must be signed in to change notification settings - Fork 1
/
preprocessing.py
81 lines (46 loc) · 1.6 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/usr/bin/env python
# coding: utf-8
# ### Reading the input file and creating a clean one
# Note: only run once
# In[1]:
import shutil
import re
import os
# Directory whose files reconstuct_metadata() lists and reads (decoded as UTF-16-LE).
path1 = "./Movie_Poster_Metadata/groundtruth"
# Scratch directory for the intermediate UTF-8 files; reconstuct_metadata()
# creates it when missing and deletes it once the cleaned files are written.
temp_path = "./Movie_Poster_Metadata/temp_groundtruth"
# Directory that receives the cleaned files. If it already exists,
# reconstuct_metadata() prints a message and does nothing.
path2 = "./Movie_Poster_Metadata/updated_groundtruth"
# In[3]:
def reconstuct_metadata(source_dir=None, temp_dir=None, output_dir=None):
    """Rewrite every raw metadata file as a cleaned, bracket-wrapped UTF-8 file.

    The work is done in two passes:

    1. Each file in ``source_dir`` is decoded as UTF-16-LE and copied to
       ``temp_dir`` as UTF-8. Along the way every ``"}\\n"`` becomes
       ``"},\\n"`` and every line that starts with a single space followed
       by ``"_id"`` is dropped.
    2. Each file in ``temp_dir`` has its first and last line removed, and
       the remainder is written to ``output_dir`` wrapped in ``[{`` ... ``}]``.

    The function is meant to run once: if ``output_dir`` already exists it
    prints a message and returns without touching anything.

    Args:
        source_dir: Directory holding the raw files. Defaults to the
            module-level ``path1``.
        temp_dir: Scratch directory for the intermediate files. Defaults to
            the module-level ``temp_path``. It is always deleted before the
            function returns or raises, including any content it held
            beforehand.
        output_dir: Directory that receives the cleaned files. Defaults to
            the module-level ``path2``.

    Returns:
        None.

    Raises:
        OSError: If ``source_dir`` cannot be listed or a file cannot be
            read or written.
        UnicodeError: If a source file is not valid UTF-16-LE.

    Note:
        If a run fails part-way, ``output_dir`` is left in place (possibly
        incomplete) and must be removed by hand before the next run,
        otherwise the "already exist" guard will skip the work.
    """
    # Resolve defaults at call time so the module-level paths are only
    # looked up when the caller did not supply an override.
    source_dir = path1 if source_dir is None else source_dir
    temp_dir = temp_path if temp_dir is None else temp_dir
    output_dir = path2 if output_dir is None else output_dir

    file_names = os.listdir(source_dir)

    # Guard first: a skipped run must not leave an empty temp directory behind.
    if os.path.exists(output_dir):
        print("directories already exist. Not cleaning metadata")
        return None

    os.makedirs(output_dir)
    os.makedirs(temp_dir, exist_ok=True)

    # Compiled once instead of once per line.
    id_line = re.compile("^ \"_id\"")

    try:
        # Pass 1: re-encode, add the separating commas, drop the "_id" lines.
        for file_name in file_names:
            raw_path = os.path.join(source_dir, file_name)
            staged_path = os.path.join(temp_dir, file_name)
            with open(raw_path, 'r', encoding='utf-16-le') as raw_file, \
                    open(staged_path, 'w', encoding='utf-8') as staged_file:
                for line in raw_file:
                    line = line.replace("}\n", "},\n")
                    if not id_line.search(line):
                        staged_file.write(line)

        # Pass 2: strip the outermost first/last line and wrap in "[{" ... "}]".
        for file_name in os.listdir(temp_dir):
            staged_path = os.path.join(temp_dir, file_name)
            cleaned_path = os.path.join(output_dir, file_name)
            with open(staged_path, 'r', encoding='utf-8') as staged_file:
                lines = staged_file.readlines()
            with open(cleaned_path, 'w', encoding='utf-8') as cleaned_file:
                cleaned_file.write("[{")
                cleaned_file.writelines(lines[1:-1])
                cleaned_file.write("}]")
    finally:
        # The scratch directory is removed on success and on failure alike.
        shutil.rmtree(temp_dir)

    return None
# In[ ]: