
Commit c279fd4: Batching and insert performance fixes

AlexArtrip committed Aug 17, 2023 (1 parent: 647b05d)

Showing 16 changed files with 367 additions and 266 deletions.
10 changes: 5 additions & 5 deletions integration-tests/corpus/test_basic_corpus.py
@@ -16,13 +16,13 @@ def test_save_then_search_one_corpus(es_client):
      text2 = "I picked up my phone and then dropped it again. I cant seem to get a good grip on things these days. It persists into my everyday tasks"
      text3 = "The weather is great today, but I worry that tomorrow it won't be. My umbrella is in the repair shop."

-     assert test_corpus.store_and_index(text1, "doc1", Citation("www.docsource1", "SSSdoc1", corpus_name, ""))
-     assert test_corpus.store_and_index(text2, "doc2", Citation("were.docsource2", "SSSdoc2", corpus_name, ""))
-     assert test_corpus.store_and_index(text3, "doc3", Citation("docsource3.ai", "SSSdoc3", corpus_name, ""))
+     assert test_corpus.store_and_index(text1, "doc1", Citation("www.docsource1", "SSSdoc1", ""))
+     assert test_corpus.store_and_index(text2, "doc2", Citation("were.docsource2", "SSSdoc2", ""))
+     assert test_corpus.store_and_index(text3, "doc3", Citation("docsource3.ai", "SSSdoc3", ""))

      time.sleep(1)
      output = test_corpus.search("It is sunny")
      # print("OUTPUT IS : ")
      # print(output)
-     assert "sunshine" in output[0][1]
-     assert "weather" in output[1][1]
+     assert "sunshine" in output[1][0]
+     assert "weather" in output[0][0]
@@ -14,6 +14,6 @@ def test_insert_and_get():
      corpus_id = uuid.uuid4()
      document_id = uuid.uuid4()

-     citation = Citation("google.com", "test google", "googlrCorpus", "just a simple test")
+     citation = Citation("google.com", "test google", "just a simple test")
      metadata.insert_document_metadata(corpus_id, document_id, 1, "test", citation)
      assert metadata.get_document_citation(corpus_id, document_id) == citation
14 changes: 7 additions & 7 deletions integration-tests/storage_driver/test_corpus_doc_store.py
@@ -27,16 +27,16 @@ def test_save_then_search(es_client):
      chunk_id2 = document_id2.hex + '{:032b}'.format(0)
      chunk_id3 = document_id3.hex + '{:032b}'.format(0)

-     assert doc_store.save_document(chunk_id1, DocumentEntity(
-         corpus_id1, document_id1, "test1", "MeMaS is great and easy to use"))
-     assert doc_store.save_document(chunk_id2, DocumentEntity(
-         corpus_id2, document_id2, "test2", "MeMaS is coded in python and is horrible"))
-     assert doc_store.save_document(chunk_id3, DocumentEntity(
-         corpus_id1, document_id3, "test3", "Memory Management System"))
+     assert doc_store.save_documents([(chunk_id1, DocumentEntity(
+         corpus_id1, document_id1, "test1", "MeMaS is great and easy to use"))])
+     assert doc_store.save_documents([(chunk_id2, DocumentEntity(
+         corpus_id2, document_id2, "test2", "MeMaS is coded in python and is horrible"))])
+     assert doc_store.save_documents([(chunk_id3, DocumentEntity(
+         corpus_id1, document_id3, "test3", "Memory Management System"))])

      time.sleep(2)

-     result = doc_store.search_corpus(corpus_id1, "memas is horrible")
+     result = doc_store.search_corpora([corpus_id1], "memas is horrible")

      # check that we only found document 1
      assert len(result) == 1
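The save_documents rename signals a batch API: it takes a list of (chunk_id, DocumentEntity) pairs, so the three one-element calls above could presumably be collapsed into a single round trip, which is the point of a batching commit. A sketch under that assumption:

# Sketch: one batched insert instead of three single-item calls.
# Assumes save_documents returns a truthy value only when the whole batch lands.
assert doc_store.save_documents([
    (chunk_id1, DocumentEntity(corpus_id1, document_id1, "test1", "MeMaS is great and easy to use")),
    (chunk_id2, DocumentEntity(corpus_id2, document_id2, "test2", "MeMaS is coded in python and is horrible")),
    (chunk_id3, DocumentEntity(corpus_id1, document_id3, "test3", "Memory Management System")),
])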
16 changes: 8 additions & 8 deletions integration-tests/storage_driver/test_corpus_vector_store.py
@@ -22,16 +22,16 @@ def test_save_then_search2():

      best_match_str = "The sun is high. California sunshine is great. "

-     store.save_document(DocumentEntity(corpus_id1, document_id0, "doc0",
-         "Before This is a runon sentence meant to test the logic of the splitting capabilites but that is only the start, there is nothing that can break this sentecne up other than some handy logic even in the worst case, too bad I only know how to use commas"))
-     store.save_document(DocumentEntity(corpus_id1, document_id1, "doc1",
-         "The sun is high! California sunshine is great. Did you catch my quest? Oh oh! lol"))
-     store.save_document(DocumentEntity(corpus_id1, document_id2, "doc2",
-         "I picked up my phone and then dropped it again"))
-     store.save_document(DocumentEntity(corpus_id2, document_id3, "doc3", "The weather is great today"))
+     store.save_documents([DocumentEntity(corpus_id1, document_id0, "doc0",
+         "Before This is a runon sentence meant to test the logic of the splitting capabilites but that is only the start, there is nothing that can break this sentecne up other than some handy logic even in the worst case, too bad I only know how to use commas")])
+     store.save_documents([DocumentEntity(corpus_id1, document_id1, "doc1",
+         "The sun is high! California sunshine is great. Did you catch my quest? Oh oh! lol")])
+     store.save_documents([DocumentEntity(corpus_id1, document_id2, "doc2",
+         "I picked up my phone and then dropped it again")])
+     store.save_documents([DocumentEntity(corpus_id2, document_id3, "doc3", "The weather is great today")])
      time.sleep(1)

-     result = store.search(corpus_id1, "How's the weather today?")
+     result = store.search_corpora([corpus_id1], "How's the weather today?")

      # assert False
      # Test that the text recovered for a short sentence matched the expected length
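Like the document store, the vector store's search_corpora now takes a list of corpus IDs rather than a single one, so one query can presumably fan out across several corpora. A sketch under that assumption, reusing the four-field result shape this commit's basic_corpus.py changes unpack:

# Sketch: one query over two corpora; result tuples follow the
# (score, doc_entity, start_index, end_index) shape used elsewhere in this commit.
results = store.search_corpora([corpus_id1, corpus_id2], "How's the weather today?")
for score, doc_entity, start_index, end_index in results:
    # start/end index delimit the matched span inside the stored document
    print(score, doc_entity.document[start_index:end_index])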
100 changes: 19 additions & 81 deletions memas/corpus/basic_corpus.py
@@ -7,14 +7,12 @@
  from memas.interface.storage_driver import DocumentEntity
  from memas.interface.exceptions import SentenceLengthOverflowException
  from memas.context_manager import ctx
- from memas.corpus.corpus_helpers import segment_document
-
-
- _log = logging.getLogger(__name__)
-
+ from memas.text_parsing.text_parsers import segment_document
+ from memas.corpus.corpus_searching import normalize_and_combine

  MAX_SEGMENT_LENGTH = 1536

+ _log = logging.getLogger(__name__)

  class BasicCorpus(Corpus):

@@ -32,20 +30,27 @@ def store_and_index(self, document: str, document_name: str, citation: Citation):
          doc_id = uuid.uuid4()
          doc_entity = DocumentEntity(self.corpus_id, doc_id, document_name, document)

-         ctx.corpus_vec.save_document(doc_entity)
+         document_chunks = segment_document(document, MAX_SEGMENT_LENGTH)
+
+         # TODO : Need to investigate how to undo when failures on partial insert
+         meta_save = ctx.corpus_metadata.insert_document_metadata(self.corpus_id, doc_id, len(document_chunks), document_name, citation)
+
+         vec_save = ctx.corpus_vec.save_documents([doc_entity])

          # Divide longer documents for document store
-         document_chunks = segment_document(document, MAX_SEGMENT_LENGTH)
-         chunk_num = 1
+         chunk_num = 0
+         chunk_id_entity_pairs = []
          for chunk in document_chunks:
              # Create the new IDs for the document chunk combo
-             chunkID = doc_id.hex + '{:032b}'.format(chunk_num)
+             chunk_id = doc_id.hex + '{:032b}'.format(chunk_num)
              chunk_num = chunk_num + 1
              doc_chunk_entity = DocumentEntity(self.corpus_id, doc_id, document_name, chunk)
-             ctx.corpus_doc.save_document(chunkID, doc_chunk_entity)
+             chunk_id_entity_pairs.append((chunk_id, doc_chunk_entity))
+
+         # Insert all chunks of document at once
+         doc_save = ctx.corpus_doc.save_documents(id_doc_pairs=chunk_id_entity_pairs)

          # TODO: Need to redo this return to be indicative of complete success
-         return ctx.corpus_metadata.insert_document_metadata(self.corpus_id, doc_id, chunk_num, document_name, citation)
+         return meta_save and vec_save and doc_save
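Two details worth noting in the new loop: chunk ordinals now start at 0 instead of 1, matching the IDs the integration tests build with '{:032b}'.format(0), and each chunk ID is the document UUID's 32-character hex concatenated with the ordinal rendered as a 32-digit binary string, so every chunk of a document shares the same prefix. A quick illustration:

import uuid

doc_id = uuid.uuid4()
chunk_id = doc_id.hex + '{:032b}'.format(0)  # chunk 0 of this document

print(len(chunk_id))                # 64: 32 hex chars + 32 binary digits for the ordinal
print(chunk_id[:32] == doc_id.hex)  # True: all chunks of a doc share this prefix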

"""
The most basic search of a document store via Elastic Search and a Vector DB
Expand All @@ -60,17 +65,16 @@ def search(self, clue: str) -> list[tuple[float, str, Citation]]:
vector_search_count: int = 10

doc_store_results: list[tuple[float, str, Citation]] = []
temp_res = ctx.corpus_doc.search_corpus(self.corpus_id, clue)
temp_res = ctx.corpus_doc.search_corpora([self.corpus_id], clue)
# Search the document store
for score, doc_entity in temp_res:
document_text = doc_entity.document
citation = ctx.corpus_metadata.get_document_citation(self.corpus_id, doc_entity.document_id)

doc_store_results.append([score, document_text, citation])

# Search for the vectors
vec_store_results: list[tuple[float, str, Citation]] = []
temp_res2 = ctx.corpus_vec.search(self.corpus_id, clue)
temp_res2 = ctx.corpus_vec.search_corpora([self.corpus_id], clue)
for score, doc_entity, start_index, end_index in temp_res2:

# Verify that the text recovered from the vectors fits the maximum sentence criteria
@@ -79,13 +83,8 @@ def search(self, clue: str) -> list[tuple[float, str, Citation]]:
                  raise SentenceLengthOverflowException(end_index - start_index)

              citation = ctx.corpus_metadata.get_document_citation(self.corpus_id, doc_entity.document_id)
-
              vec_store_results.append([score, doc_entity.document, citation])

-         # print("Docs then Vecs : ")
-         # print(doc_store_results)
-         # print(vec_store_results)
-
          # If any of the searches returned no results combine and return
          if len(vec_store_results) == 0:
              doc_store_results.sort(key=lambda x: x[0], reverse=True)
@@ -103,67 +102,6 @@ def generate_search_instructions(self, clue: str) -> any:
          pass


- def normalize_and_combine(doc_results: list, vec_results: list):
-     # normalization with assumption that top score matches are approximately equal
-
-     # Vec scores are based on distance, so smaller is better. Need to inverse the
-     # order to be comparable to something like elastic search where bigger is better.
-     doc_scores = ([x[0] for x in doc_results])
-     vec_scores = ([x[0] for x in vec_results])
-
-     doc_max_score = max(doc_scores)
-     doc_min_score = min(doc_scores)
-
-     vec_max_score = max(vec_scores)
-     vec_min_score = min(vec_scores)
-
-     doc_results_normalized = []
-     vec_results_normalized = []
-
-     # Normalize and shift doc results to be between 0 and 1, with 1 being best responses and 0 being worst
-     if (doc_max_score != doc_min_score):
-         doc_results_normalized = [[(x - doc_min_score) / (doc_max_score - doc_min_score), y, z]
-                                   for [x, y, z] in doc_results]
-
-     # Vector results assume L2 distance of unit vectors so the range is between 0 and 2.
-     # if(vec_max_score != vec_min_score) :
-     #     vec_results_normalized = [[(vec_max_score - x) / (vec_max_score - vec_min_score), y, z]
-     #                               for [x, y, z] in vec_results]
-     vec_results_normalized = [[2 - x, y, z] for [x, y, z] in vec_results]
-
-     # Reward documents that contain high scoring vectors and remove the searched vector.
-
-     # TODO : Check that this isn't super slow for larger documents and more search results
-
-     # Was considering adjusting the score reward by the document length when a document
-     # has a vector within it. Idea was longer docs share more sentences, so they're over rewarded.
-     # That might just mean its a good document, so it should recieve that score increase.
-     # avg_doc_len = reduce(lambda x, y : len(x[1]) + len(y[1]), doc_results_normalized)
-
-     duplicate_vec_indicies = []
-     doc_index = 0
-     for doc_score, doc_text, doc_citation in doc_results_normalized:
-         vec_index = 0
-
-         for vec_score, vec_text, vec_citation in vec_results_normalized:
-             if (vec_text in doc_text):
-                 duplicate_vec_indicies.append(vec_index)
-                 doc_score = doc_score + vec_score
-             vec_index = vec_index + 1
-
-         doc_results_normalized[doc_index][0] = doc_score
-         doc_index = doc_index + 1
-
-     unique_vectors = [i for j, i in enumerate(vec_results_normalized) if j not in duplicate_vec_indicies]
-
-     doc_results_normalized.extend(unique_vectors)
-
-     # Sort by descending scoring so best results come first
-     doc_results_normalized.sort(key=lambda x: x[0], reverse=True)
-
-     return doc_results_normalized
-
-
  class BasicCorpusFactory(CorpusFactory):
      def produce(self, corpus_id: uuid.UUID):
          # TODO: Maybe change the Corpus Name Parameter
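The function is not gone: the new import at the top of this file pulls normalize_and_combine from memas.corpus.corpus_searching, so it has been relocated rather than removed. Its scheme, for reference: min-max scale the document-store scores into [0, 1], invert vector distances as 2 - d (the L2 distance between unit vectors lies in [0, 2]), boost any document containing a matched vector snippet by that snippet's score, and drop the now-duplicate snippet. A toy worked example of those rules, not the library code itself:

doc_results = [[7.0, "alpha beta gamma", "cite1"], [3.0, "delta", "cite2"]]
vec_results = [[0.4, "alpha beta", "cite1"]]   # raw L2 distance: smaller is better

# Min-max normalize doc scores: 7.0 -> 1.0, 3.0 -> 0.0
doc_norm = [[(s - 3.0) / (7.0 - 3.0), t, c] for s, t, c in doc_results]

# Invert vector distances: 0.4 -> 1.6
vec_norm = [[2 - s, t, c] for s, t, c in vec_results]

# "alpha beta" occurs inside the first document, so that document's score
# becomes 1.0 + 1.6 = 2.6 and the duplicate vector hit is dropped, giving:
# [[2.6, "alpha beta gamma", "cite1"], [0.0, "delta", "cite2"]]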
48 changes: 0 additions & 48 deletions memas/corpus/corpus_helpers.py

This file was deleted.
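Taken together with the import changes above, neither helper actually disappears: segment_document now comes from memas.text_parsing.text_parsers and normalize_and_combine from memas.corpus.corpus_searching, so this file's contents appear to have been redistributed rather than deleted outright.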
