
Commit c279fd4: Batching and insert performance fixes

AlexArtrip committed Aug 17, 2023 (1 parent: 647b05d)

Showing 16 changed files with 367 additions and 266 deletions.
10 changes: 5 additions & 5 deletions integration-tests/corpus/test_basic_corpus.py
@@ -16,13 +16,13 @@ def test_save_then_search_one_corpus(es_client):
      text2 = "I picked up my phone and then dropped it again. I cant seem to get a good grip on things these days. It persists into my everyday tasks"
      text3 = "The weather is great today, but I worry that tomorrow it won't be. My umbrella is in the repair shop."

-     assert test_corpus.store_and_index(text1, "doc1", Citation("www.docsource1", "SSSdoc1", corpus_name, ""))
-     assert test_corpus.store_and_index(text2, "doc2", Citation("were.docsource2", "SSSdoc2", corpus_name, ""))
-     assert test_corpus.store_and_index(text3, "doc3", Citation("docsource3.ai", "SSSdoc3", corpus_name, ""))
+     assert test_corpus.store_and_index(text1, "doc1", Citation("www.docsource1", "SSSdoc1", ""))
+     assert test_corpus.store_and_index(text2, "doc2", Citation("were.docsource2", "SSSdoc2", ""))
+     assert test_corpus.store_and_index(text3, "doc3", Citation("docsource3.ai", "SSSdoc3", ""))

      time.sleep(1)
      output = test_corpus.search("It is sunny")
      # print("OUTPUT IS : ")
      # print(output)
-     assert "sunshine" in output[0][1]
-     assert "weather" in output[1][1]
+     assert "sunshine" in output[1][0]
+     assert "weather" in output[0][0]
@@ -14,6 +14,6 @@ def test_insert_and_get():
      corpus_id = uuid.uuid4()
      document_id = uuid.uuid4()

-     citation = Citation("google.com", "test google", "googlrCorpus", "just a simple test")
+     citation = Citation("google.com", "test google", "just a simple test")
      metadata.insert_document_metadata(corpus_id, document_id, 1, "test", citation)
      assert metadata.get_document_citation(corpus_id, document_id) == citation
14 changes: 7 additions & 7 deletions integration-tests/storage_driver/test_corpus_doc_store.py
@@ -27,16 +27,16 @@ def test_save_then_search(es_client):
      chunk_id2 = document_id2.hex + '{:032b}'.format(0)
      chunk_id3 = document_id3.hex + '{:032b}'.format(0)

-     assert doc_store.save_document(chunk_id1, DocumentEntity(
-         corpus_id1, document_id1, "test1", "MeMaS is great and easy to use"))
-     assert doc_store.save_document(chunk_id2, DocumentEntity(
-         corpus_id2, document_id2, "test2", "MeMaS is coded in python and is horrible"))
-     assert doc_store.save_document(chunk_id3, DocumentEntity(
-         corpus_id1, document_id3, "test3", "Memory Management System"))
+     assert doc_store.save_documents([(chunk_id1, DocumentEntity(
+         corpus_id1, document_id1, "test1", "MeMaS is great and easy to use"))])
+     assert doc_store.save_documents([(chunk_id2, DocumentEntity(
+         corpus_id2, document_id2, "test2", "MeMaS is coded in python and is horrible"))])
+     assert doc_store.save_documents([(chunk_id3, DocumentEntity(
+         corpus_id1, document_id3, "test3", "Memory Management System"))])

      time.sleep(2)

-     result = doc_store.search_corpus(corpus_id1, "memas is horrible")
+     result = doc_store.search_corpora([corpus_id1], "memas is horrible")

      # check that we only found document 1
      assert len(result) == 1
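The save_documents rename signals a batch API: it takes a list of (chunk_id, DocumentEntity) pairs, so the three one-element calls above could presumably be collapsed into a single round trip, which is the point of a batching commit. A sketch under that assumption:

# Sketch: one batched insert instead of three single-item calls.
# Assumes save_documents returns a truthy value only when the whole batch lands.
assert doc_store.save_documents([
    (chunk_id1, DocumentEntity(corpus_id1, document_id1, "test1", "MeMaS is great and easy to use")),
    (chunk_id2, DocumentEntity(corpus_id2, document_id2, "test2", "MeMaS is coded in python and is horrible")),
    (chunk_id3, DocumentEntity(corpus_id1, document_id3, "test3", "Memory Management System")),
])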
16 changes: 8 additions & 8 deletions integration-tests/storage_driver/test_corpus_vector_store.py
@@ -22,16 +22,16 @@ def test_save_then_search2():

      best_match_str = "The sun is high. California sunshine is great. "

-     store.save_document(DocumentEntity(corpus_id1, document_id0, "doc0",
-         "Before This is a runon sentence meant to test the logic of the splitting capabilites but that is only the start, there is nothing that can break this sentecne up other than some handy logic even in the worst case, too bad I only know how to use commas"))
-     store.save_document(DocumentEntity(corpus_id1, document_id1, "doc1",
-         "The sun is high! California sunshine is great. Did you catch my quest? Oh oh! lol"))
-     store.save_document(DocumentEntity(corpus_id1, document_id2, "doc2",
-         "I picked up my phone and then dropped it again"))
-     store.save_document(DocumentEntity(corpus_id2, document_id3, "doc3", "The weather is great today"))
+     store.save_documents([DocumentEntity(corpus_id1, document_id0, "doc0",
+         "Before This is a runon sentence meant to test the logic of the splitting capabilites but that is only the start, there is nothing that can break this sentecne up other than some handy logic even in the worst case, too bad I only know how to use commas")])
+     store.save_documents([DocumentEntity(corpus_id1, document_id1, "doc1",
+         "The sun is high! California sunshine is great. Did you catch my quest? Oh oh! lol")])
+     store.save_documents([DocumentEntity(corpus_id1, document_id2, "doc2",
+         "I picked up my phone and then dropped it again")])
+     store.save_documents([DocumentEntity(corpus_id2, document_id3, "doc3", "The weather is great today")])
      time.sleep(1)

-     result = store.search(corpus_id1, "How's the weather today?")
+     result = store.search_corpora([corpus_id1], "How's the weather today?")

      # assert False
      # Test that the text recovered for a short sentence matched the expected length
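Like the document store, the vector store's search_corpora now takes a list of corpus IDs rather than a single one, so one query can presumably fan out across several corpora. A sketch under that assumption, reusing the four-field result shape this commit's basic_corpus.py changes unpack:

# Sketch: one query over two corpora; result tuples follow the
# (score, doc_entity, start_index, end_index) shape used elsewhere in this commit.
results = store.search_corpora([corpus_id1, corpus_id2], "How's the weather today?")
for score, doc_entity, start_index, end_index in results:
    # start/end index delimit the matched span inside the stored document
    print(score, doc_entity.document[start_index:end_index])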
100 changes: 19 additions & 81 deletions memas/corpus/basic_corpus.py
@@ -7,14 +7,12 @@
  from memas.interface.storage_driver import DocumentEntity
  from memas.interface.exceptions import SentenceLengthOverflowException
  from memas.context_manager import ctx
- from memas.corpus.corpus_helpers import segment_document
-
-
- _log = logging.getLogger(__name__)
-
+ from memas.text_parsing.text_parsers import segment_document
+ from memas.corpus.corpus_searching import normalize_and_combine

  MAX_SEGMENT_LENGTH = 1536

+ _log = logging.getLogger(__name__)

  class BasicCorpus(Corpus):

@@ -32,20 +30,27 @@ def store_and_index(self, document: str, document_name: str, citation: Citation):
          doc_id = uuid.uuid4()
          doc_entity = DocumentEntity(self.corpus_id, doc_id, document_name, document)

-         ctx.corpus_vec.save_document(doc_entity)
+         document_chunks = segment_document(document, MAX_SEGMENT_LENGTH)
+
+         # TODO : Need to investigate how to undo when failures on partial insert
+         meta_save = ctx.corpus_metadata.insert_document_metadata(self.corpus_id, doc_id, len(document_chunks), document_name, citation)
+
+         vec_save = ctx.corpus_vec.save_documents([doc_entity])

          # Divide longer documents for document store
-         document_chunks = segment_document(document, MAX_SEGMENT_LENGTH)
-         chunk_num = 1
+         chunk_num = 0
+         chunk_id_entity_pairs = []
          for chunk in document_chunks:
              # Create the new IDs for the document chunk combo
-             chunkID = doc_id.hex + '{:032b}'.format(chunk_num)
+             chunk_id = doc_id.hex + '{:032b}'.format(chunk_num)
              chunk_num = chunk_num + 1
              doc_chunk_entity = DocumentEntity(self.corpus_id, doc_id, document_name, chunk)
-             ctx.corpus_doc.save_document(chunkID, doc_chunk_entity)
+             chunk_id_entity_pairs.append((chunk_id, doc_chunk_entity))
+
+         # Insert all chunks of document at once
+         doc_save = ctx.corpus_doc.save_documents(id_doc_pairs=chunk_id_entity_pairs)

          # TODO: Need to redo this return to be indicative of complete success
-         return ctx.corpus_metadata.insert_document_metadata(self.corpus_id, doc_id, chunk_num, document_name, citation)
+         return meta_save and vec_save and doc_save
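Two details worth noting in the new loop: chunk ordinals now start at 0 instead of 1, matching the IDs the integration tests build with '{:032b}'.format(0), and each chunk ID is the document UUID's 32-character hex concatenated with the ordinal rendered as a 32-digit binary string, so every chunk of a document shares the same prefix. A quick illustration:

import uuid

doc_id = uuid.uuid4()
chunk_id = doc_id.hex + '{:032b}'.format(0)  # chunk 0 of this document

print(len(chunk_id))                # 64: 32 hex chars + 32 binary digits for the ordinal
print(chunk_id[:32] == doc_id.hex)  # True: all chunks of a doc share this prefix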

"""
The most basic search of a document store via Elastic Search and a Vector DB
Expand All @@ -60,17 +65,16 @@ def search(self, clue: str) -> list[tuple[float, str, Citation]]:
vector_search_count: int = 10

doc_store_results: list[tuple[float, str, Citation]] = []
temp_res = ctx.corpus_doc.search_corpus(self.corpus_id, clue)
temp_res = ctx.corpus_doc.search_corpora([self.corpus_id], clue)
# Search the document store
for score, doc_entity in temp_res:
document_text = doc_entity.document
citation = ctx.corpus_metadata.get_document_citation(self.corpus_id, doc_entity.document_id)

doc_store_results.append([score, document_text, citation])

# Search for the vectors
vec_store_results: list[tuple[float, str, Citation]] = []
temp_res2 = ctx.corpus_vec.search(self.corpus_id, clue)
temp_res2 = ctx.corpus_vec.search_corpora([self.corpus_id], clue)
for score, doc_entity, start_index, end_index in temp_res2:

# Verify that the text recovered from the vectors fits the maximum sentence criteria
@@ -79,13 +83,8 @@ def search(self, clue: str) -> list[tuple[float, str, Citation]]:
                  raise SentenceLengthOverflowException(end_index - start_index)

              citation = ctx.corpus_metadata.get_document_citation(self.corpus_id, doc_entity.document_id)
-
              vec_store_results.append([score, doc_entity.document, citation])

-         # print("Docs then Vecs : ")
-         # print(doc_store_results)
-         # print(vec_store_results)
-
          # If any of the searches returned no results combine and return
          if len(vec_store_results) == 0:
              doc_store_results.sort(key=lambda x: x[0], reverse=True)
@@ -103,67 +102,6 @@ def generate_search_instructions(self, clue: str) -> any:
          pass


- def normalize_and_combine(doc_results: list, vec_results: list):
-     # normalization with assumption that top score matches are approximately equal
-
-     # Vec scores are based on distance, so smaller is better. Need to inverse the
-     # order to be comparable to something like elastic search where bigger is better.
-     doc_scores = ([x[0] for x in doc_results])
-     vec_scores = ([x[0] for x in vec_results])
-
-     doc_max_score = max(doc_scores)
-     doc_min_score = min(doc_scores)
-
-     vec_max_score = max(vec_scores)
-     vec_min_score = min(vec_scores)
-
-     doc_results_normalized = []
-     vec_results_normalized = []
-
-     # Normalize and shift doc results to be between 0 and 1, with 1 being best responses and 0 being worst
-     if (doc_max_score != doc_min_score):
-         doc_results_normalized = [[(x - doc_min_score) / (doc_max_score - doc_min_score), y, z]
-                                   for [x, y, z] in doc_results]
-
-     # Vector results assume L2 distance of unit vectors so the range is between 0 and 2.
-     # if(vec_max_score != vec_min_score) :
-     #     vec_results_normalized = [[(vec_max_score - x) / (vec_max_score - vec_min_score), y, z]
-     #                               for [x, y, z] in vec_results]
-     vec_results_normalized = [[2 - x, y, z] for [x, y, z] in vec_results]
-
-     # Reward documents that contain high scoring vectors and remove the searched vector.
-
-     # TODO : Check that this isn't super slow for larger documents and more search results
-
-     # Was considering adjusting the score reward by the document length when a document
-     # has a vector within it. Idea was longer docs share more sentences, so they're over rewarded.
-     # That might just mean its a good document, so it should recieve that score increase.
-     # avg_doc_len = reduce(lambda x, y : len(x[1]) + len(y[1]), doc_results_normalized)
-
-     duplicate_vec_indicies = []
-     doc_index = 0
-     for doc_score, doc_text, doc_citation in doc_results_normalized:
-         vec_index = 0
-
-         for vec_score, vec_text, vec_citation in vec_results_normalized:
-             if (vec_text in doc_text):
-                 duplicate_vec_indicies.append(vec_index)
-                 doc_score = doc_score + vec_score
-             vec_index = vec_index + 1
-
-         doc_results_normalized[doc_index][0] = doc_score
-         doc_index = doc_index + 1
-
-     unique_vectors = [i for j, i in enumerate(vec_results_normalized) if j not in duplicate_vec_indicies]
-
-     doc_results_normalized.extend(unique_vectors)
-
-     # Sort by descending scoring so best results come first
-     doc_results_normalized.sort(key=lambda x: x[0], reverse=True)
-
-     return doc_results_normalized
-
-
  class BasicCorpusFactory(CorpusFactory):
      def produce(self, corpus_id: uuid.UUID):
          # TODO: Maybe change the Corpus Name Parameter
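The function is not gone: the new import at the top of this file pulls normalize_and_combine from memas.corpus.corpus_searching, so it has been relocated rather than removed. Its scheme, for reference: min-max scale the document-store scores into [0, 1], invert vector distances as 2 - d (the L2 distance between unit vectors lies in [0, 2]), boost any document containing a matched vector snippet by that snippet's score, and drop the now-duplicate snippet. A toy worked example of those rules, not the library code itself:

doc_results = [[7.0, "alpha beta gamma", "cite1"], [3.0, "delta", "cite2"]]
vec_results = [[0.4, "alpha beta", "cite1"]]   # raw L2 distance: smaller is better

# Min-max normalize doc scores: 7.0 -> 1.0, 3.0 -> 0.0
doc_norm = [[(s - 3.0) / (7.0 - 3.0), t, c] for s, t, c in doc_results]

# Invert vector distances: 0.4 -> 1.6
vec_norm = [[2 - s, t, c] for s, t, c in vec_results]

# "alpha beta" occurs inside the first document, so that document's score
# becomes 1.0 + 1.6 = 2.6 and the duplicate vector hit is dropped, giving:
# [[2.6, "alpha beta gamma", "cite1"], [0.0, "delta", "cite2"]]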
48 changes: 0 additions & 48 deletions memas/corpus/corpus_helpers.py

This file was deleted.
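Taken together with the import changes above, neither helper actually disappears: segment_document now comes from memas.text_parsing.text_parsers and normalize_and_combine from memas.corpus.corpus_searching, so this file's contents appear to have been redistributed rather than deleted outright.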
