From 1066e959a2b87f1c1b69a74fecbca085466084f2 Mon Sep 17 00:00:00 2001
From: Vladimir Blagojevic
Date: Mon, 3 Jul 2023 14:07:52 +0200
Subject: [PATCH] bug: fix for pinecone not working for per document updates (#5110)

---
 haystack/document_stores/pinecone.py  |  13 ++-
 test/document_stores/test_pinecone.py | 119 +++++++++++++++++++++++++-
 2 files changed, 124 insertions(+), 8 deletions(-)

diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py
index 5981b6bd27..7051618c18 100644
--- a/haystack/document_stores/pinecone.py
+++ b/haystack/document_stores/pinecone.py
@@ -297,12 +297,9 @@ def get_document_count(
         pinecone_syntax_filter = LogicalFilterClause.parse(filters).convert_to_pinecone() if filters else None
 
         stats = self.pinecone_indexes[index].describe_index_stats(filter=pinecone_syntax_filter)
-        # Document count is total number of vectors across all namespaces (no-vectors + vectors)
-        count = 0
-        for namespace in stats["namespaces"].keys():
-            if not (only_documents_without_embedding and "no-vectors" not in namespace):
-                count += stats["namespaces"][namespace]["vector_count"]
-        return count
+        if only_documents_without_embedding:
+            return sum(value["vector_count"] for key, value in stats["namespaces"].items() if "no-vectors" in key)
+        return sum(value["vector_count"] for value in stats["namespaces"].values())
 
     def _validate_index_sync(self, index: Optional[str] = None):
         """
@@ -497,7 +494,9 @@ def update_embeddings(
                 f"Couldn't find a the index '{index}' in Pinecone. Try to init the "
                 f"PineconeDocumentStore() again ..."
             )
-        document_count = self.get_document_count(index=index, filters=filters)
+        document_count = self.get_document_count(
+            index=index, filters=filters, only_documents_without_embedding=not update_existing_embeddings
+        )
         if document_count == 0:
             logger.warning("Calling DocumentStore.update_embeddings() on an empty index")
             return
diff --git a/test/document_stores/test_pinecone.py b/test/document_stores/test_pinecone.py
index 56c5caab46..4a3a9f28e2 100644
--- a/test/document_stores/test_pinecone.py
+++ b/test/document_stores/test_pinecone.py
@@ -3,7 +3,7 @@
 import os
 import numpy as np
 from inspect import getmembers, isclass, isfunction
-from unittest.mock import MagicMock, ANY
+from unittest.mock import MagicMock
 
 import pytest
 
@@ -471,6 +471,123 @@ def test_get_embedding_count(self, doc_store_with_docs: PineconeDocumentStore):
         doc_store_with_docs.write_documents([doc])
         assert doc_store_with_docs.get_embedding_count() == 1
 
+    @pytest.mark.integration
+    def test_get_document_count_after_write_doc_with_embedding(self, doc_store_with_docs: PineconeDocumentStore):
+        """
+        Tests that get_document_count() returns the correct number of documents in the document store after a document
+        with an embedding is written to the document store.
+        """
+        # there are 9 docs in doc_store_with_docs (all without embeddings)
+        initial_document_count = 9
+
+        # we expect initial_document_count documents without embeddings in doc_store_with_docs
+        assert doc_store_with_docs.get_document_count(only_documents_without_embedding=True) == initial_document_count
+        # and also initial_document_count documents in total
+        assert doc_store_with_docs.get_document_count() == initial_document_count
+
+        # document with embedding is written to doc_store_with_docs
+        doc = Document(content="Doc with embedding", embedding=np.random.rand(768).astype(np.float32))
+        doc_store_with_docs.write_documents([doc])
+
+        # so we expect initial_document_count + 1 documents in total
+        assert doc_store_with_docs.get_document_count() == initial_document_count + 1
+
+        # but we expect initial_document_count documents without embeddings to be unchanged
+        assert doc_store_with_docs.get_document_count(only_documents_without_embedding=True) == initial_document_count
+
+    @pytest.mark.integration
+    def test_get_document_count_after_write_doc_without_embedding(self, doc_store_with_docs: PineconeDocumentStore):
+        """
+        Tests that get_document_count() returns the correct number of documents in the document store after a document
+        without an embedding is written to the document store.
+        """
+        # there are 9 docs in doc_store_with_docs (all without embeddings)
+        initial_document_count = 9
+
+        # we expect initial_document_count documents without embeddings in doc_store_with_docs
+        assert doc_store_with_docs.get_document_count(only_documents_without_embedding=True) == initial_document_count
+        # and we also expect initial_document_count documents in total
+        assert doc_store_with_docs.get_document_count() == initial_document_count
+
+        # document without embedding is written to doc_store_with_docs
+        doc = Document(content="Doc without embedding")
+        doc_store_with_docs.write_documents([doc])
+
+        # we now expect initial_document_count + 1 documents in total
+        assert doc_store_with_docs.get_document_count() == initial_document_count + 1
+
+        # And we also expect initial_document_count + 1 documents without embeddings, because the document we just
+        # wrote has no embeddings
+        assert (
+            doc_store_with_docs.get_document_count(only_documents_without_embedding=True) == initial_document_count + 1
+        )
+
+    @pytest.mark.integration
+    def test_get_document_count_after_delete_doc_with_embedding(self, doc_store_with_docs: PineconeDocumentStore):
+        """
+        Tests that get_document_count() returns the correct number of documents in the document store after a document
+        with an embedding is deleted from the document store.
+        """
+        # there are 9 docs in doc_store_with_docs (all without embeddings)
+        initial_document_count = 9
+
+        # we expect initial_document_count documents without embeddings in doc_store_with_docs
+        assert doc_store_with_docs.get_document_count(only_documents_without_embedding=True) == initial_document_count
+        # and also initial_document_count documents in total
+        assert doc_store_with_docs.get_document_count() == initial_document_count
+
+        # two documents with embedding are written to doc_store_with_docs
+        doc_1 = Document(content="Doc with embedding 1", embedding=np.random.rand(768).astype(np.float32))
+        doc_2 = Document(content="Doc with embedding 2", embedding=np.random.rand(768).astype(np.float32))
+        doc_store_with_docs.write_documents([doc_1, doc_2])
+
+        # total number is initial_document_count + 2
+        assert doc_store_with_docs.get_document_count() == initial_document_count + 2
+
+        # remove one of the documents with embedding
+        all_embedding_docs = doc_store_with_docs.get_all_documents(namespace="vectors")
+        doc_store_with_docs.delete_documents(ids=[all_embedding_docs[0].id])
+
+        # since we deleted one doc, we expect initial_document_count + 1 documents in total
+        assert doc_store_with_docs.get_document_count() == initial_document_count + 1
+
+        # and we expect initial_document_count documents without embeddings
+        assert doc_store_with_docs.get_document_count(only_documents_without_embedding=True) == initial_document_count
+
+    @pytest.mark.integration
+    def test_get_document_count_after_delete_doc_without_embedding(self, doc_store_with_docs: PineconeDocumentStore):
+        """
+        Tests that get_document_count() returns the correct number of documents in the document store after a document
+        without embedding is deleted from the document store.
+        """
+        # there are 9 docs in doc_store_with_docs (all without embeddings)
+        initial_document_count = 9
+
+        # therefore we expect initial_document_count documents without embeddings in doc_store_with_docs
+        assert doc_store_with_docs.get_document_count(only_documents_without_embedding=True) == initial_document_count
+        # and also initial_document_count documents in total
+        assert doc_store_with_docs.get_document_count() == initial_document_count
+
+        # two documents without embedding are written to doc_store_with_docs
+        doc_1 = Document(content="Doc without embedding 1", embedding=None)
+        doc_2 = Document(content="Doc without embedding 2", embedding=None)
+        doc_store_with_docs.write_documents([doc_1, doc_2])
+
+        # total number is initial_document_count + 2
+        assert doc_store_with_docs.get_document_count() == initial_document_count + 2
+
+        # remove one of the documents without embedding
+        all_non_embedding_docs = doc_store_with_docs.get_all_documents(namespace="no-vectors")
+        doc_store_with_docs.delete_documents(ids=[all_non_embedding_docs[0].id])
+
+        # since we deleted one doc, we expect initial_document_count + 1 documents in total
+        assert doc_store_with_docs.get_document_count() == initial_document_count + 1
+
+        # and we expect initial_document_count + 1 documents without embeddings as well
+        assert (
+            doc_store_with_docs.get_document_count(only_documents_without_embedding=True) == initial_document_count + 1
+        )
+
     @pytest.mark.unit
     def test_get_all_labels_legacy_document_id(self, ds, monkeypatch):
         monkeypatch.setattr(