-
Notifications
You must be signed in to change notification settings - Fork 1
/
word-embeddings.py
101 lines (76 loc) · 3.03 KB
/
word-embeddings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import hnswlib
import numpy as np
from sentence_transformers import SentenceTransformer
import time
import csv
import matplotlib.pyplot as plt
import pickle
import os
import random
spaceName = "cosine"
wordList= []
count = 300000
def write_list(a_list):
with open(f'embeddings{count}.pickle','wb') as fp:
pickle.dump(a_list,fp)
print("Done writing to pickle file")
def read_list():
with open(f'embeddings{count}.pickle','rb') as fp:
return_list = pickle.load(fp)
return return_list
model = SentenceTransformer('all-MiniLM-L6-v2')
with open('unigram_freq.csv') as file:
for i in range(count):
line = file.readline()
line = line.split(",")[0]
wordList.append(line)
print(f"Succesfully loaded {count} words from Unigrams file into a list")
if os.path.isfile(f"embeddings{count}.pickle"):
#No need to make an embeddings.csv file
print("Loaded embeddings from saved pickle file")
embeddings = read_list()
else:
print("Creating new embeddings...")
start_time = time.time()
#sentences= ["I really like math","I really like art", "I really hate science", "I really hate math"]
embeddings = model.encode(wordList)
write_list(embeddings)
end_time = time.time()
print(f"Converted all words into embeddings in {end_time - start_time}")
dimension = embeddings.shape[1]
p = hnswlib.Index(space=spaceName,dim=dimension)
print("Starting to load HNSW graph...")
if os.path.isfile(f"HNSWmodel{count}"):
print("Loading model from file")
p.load_index(f"HNSWmodel{count}", max_elements = 400000)
else:
print("Creating new model...")
p.init_index(max_elements=400000,ef_construction=200,M=16) #What is ef_construction and M? Need to read initial paper
start_time = time.time()
#ef stands for link the number of neigbours per node
p.add_items(embeddings)
p.set_ef(50)
end_time = time.time()
p.save_index(f"HNSWmodel{count}")
print(f"Succesfully loaded model in {end_time - start_time} seconds")
new_word1 = "king"
new_word2 = "crown"
new_word1 = model.encode([new_word1])
new_word2 = model.encode([new_word2])
new_embedding = (new_word2 + new_word1)
print(new_word1[0][0])
print(new_word2[0][0])
print(new_embedding[0][0])
# Fetch k neighbours
start_time = time.time()
for i in range(100):
labels, distances = p.knn_query(new_embedding,k=1)
end_time = time.time()
elapsed_time = end_time - start_time
# The Elapsed time greatly differs, even with the same model, same data, and same query parameter
print(elapsed_time)
print(wordList[labels[0][0]])#HNSW algorithm gives
#Start from what graphs I want, then come from there.
#First graph, for each distance metric, each data point will be the time it takes to search in a vectoe database starting with 10,000, then adding 10,000 elements to the database.
#For each version of the model, determine the average time it takes to search up 100 randomly selected elements.
#Graph in millseconds on the y-axis, and number of elements in the HNSW model on the x-axis.