Skip to content

Commit

Permalink
bug: fix for pinecone not working for per document updates (deepset-a…
Browse files Browse the repository at this point in the history
  • Loading branch information
vblagoje authored Jul 3, 2023
1 parent 1be3936 commit 1066e95
Show file tree
Hide file tree
Showing 2 changed files with 124 additions and 8 deletions.
13 changes: 6 additions & 7 deletions haystack/document_stores/pinecone.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,12 +297,9 @@ def get_document_count(
pinecone_syntax_filter = LogicalFilterClause.parse(filters).convert_to_pinecone() if filters else None

stats = self.pinecone_indexes[index].describe_index_stats(filter=pinecone_syntax_filter)
# Document count is total number of vectors across all namespaces (no-vectors + vectors)
count = 0
for namespace in stats["namespaces"].keys():
if not (only_documents_without_embedding and "no-vectors" not in namespace):
count += stats["namespaces"][namespace]["vector_count"]
return count
if only_documents_without_embedding:
return sum(value["vector_count"] for key, value in stats["namespaces"].items() if "no-vectors" in key)
return sum(value["vector_count"] for value in stats["namespaces"].values())

def _validate_index_sync(self, index: Optional[str] = None):
"""
Expand Down Expand Up @@ -497,7 +494,9 @@ def update_embeddings(
f"Couldn't find a the index '{index}' in Pinecone. Try to init the "
f"PineconeDocumentStore() again ..."
)
document_count = self.get_document_count(index=index, filters=filters)
document_count = self.get_document_count(
index=index, filters=filters, only_documents_without_embedding=not update_existing_embeddings
)
if document_count == 0:
logger.warning("Calling DocumentStore.update_embeddings() on an empty index")
return
Expand Down
119 changes: 118 additions & 1 deletion test/document_stores/test_pinecone.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import os
import numpy as np
from inspect import getmembers, isclass, isfunction
from unittest.mock import MagicMock, ANY
from unittest.mock import MagicMock

import pytest

Expand Down Expand Up @@ -471,6 +471,123 @@ def test_get_embedding_count(self, doc_store_with_docs: PineconeDocumentStore):
doc_store_with_docs.write_documents([doc])
assert doc_store_with_docs.get_embedding_count() == 1

@pytest.mark.integration
def test_get_document_count_after_write_doc_with_embedding(self, doc_store_with_docs: PineconeDocumentStore):
    """
    Tests that get_document_count() returns the correct number of documents in the document store after a document
    with an embedding is written to the document store.

    :param doc_store_with_docs: Document store fixture; this test relies on it holding 9 documents, none of which
        has an embedding.
    """
    # there are 9 docs in doc_store_with_docs (all without embeddings)
    initial_document_count = 9

    # we expect initial_document_count documents without embeddings in doc_store_with_docs
    assert doc_store_with_docs.get_document_count(only_documents_without_embedding=True) == initial_document_count
    # and also initial_document_count documents in total
    assert doc_store_with_docs.get_document_count() == initial_document_count

    # document with embedding is written to doc_store_with_docs
    # (plain string literal: there is nothing to interpolate, so no f-prefix)
    doc = Document(content="Doc with embedding", embedding=np.random.rand(768).astype(np.float32))
    doc_store_with_docs.write_documents([doc])

    # so we expect initial_document_count + 1 documents in total
    assert doc_store_with_docs.get_document_count() == initial_document_count + 1

    # but we expect initial_document_count documents without embeddings to be unchanged
    assert doc_store_with_docs.get_document_count(only_documents_without_embedding=True) == initial_document_count

@pytest.mark.integration
def test_get_document_count_after_write_doc_without_embedding(self, doc_store_with_docs: PineconeDocumentStore):
    """
    Tests that get_document_count() returns the correct number of documents in the document store after a document
    without an embedding is written to the document store.

    :param doc_store_with_docs: Document store fixture; this test relies on it holding 9 documents, none of which
        has an embedding.
    """
    # there are 9 docs in doc_store_with_docs (all without embeddings)
    initial_document_count = 9

    # we expect initial_document_count documents without embeddings in doc_store_with_docs
    assert doc_store_with_docs.get_document_count(only_documents_without_embedding=True) == initial_document_count
    # and we also expect initial_document_count documents in total
    assert doc_store_with_docs.get_document_count() == initial_document_count

    # document without embedding is written to doc_store_with_docs
    # (plain string literal: there is nothing to interpolate, so no f-prefix)
    doc = Document(content="Doc without embedding")
    doc_store_with_docs.write_documents([doc])

    # we now expect initial_document_count + 1 documents in total
    assert doc_store_with_docs.get_document_count() == initial_document_count + 1

    # And we also expect initial_document_count + 1 documents without embeddings, because the document we just
    # wrote has no embeddings
    docs_without_embedding = doc_store_with_docs.get_document_count(only_documents_without_embedding=True)
    assert docs_without_embedding == initial_document_count + 1

@pytest.mark.integration
def test_get_document_count_after_delete_doc_with_embedding(self, doc_store_with_docs: PineconeDocumentStore):
    """
    Tests that get_document_count() returns the correct number of documents in the document store after a document
    with an embedding is deleted from the document store.

    :param doc_store_with_docs: Document store fixture; this test relies on it holding 9 documents, none of which
        has an embedding.
    """
    # there are 9 docs in doc_store_with_docs (all without embeddings)
    initial_document_count = 9

    # we expect initial_document_count documents without embeddings in doc_store_with_docs
    assert doc_store_with_docs.get_document_count(only_documents_without_embedding=True) == initial_document_count
    # and also initial_document_count documents in total
    assert doc_store_with_docs.get_document_count() == initial_document_count

    # two documents with embedding are written to doc_store_with_docs
    # (plain string literals: there is nothing to interpolate, so no f-prefix)
    doc_1 = Document(content="Doc with embedding 1", embedding=np.random.rand(768).astype(np.float32))
    doc_2 = Document(content="Doc with embedding 2", embedding=np.random.rand(768).astype(np.float32))
    doc_store_with_docs.write_documents([doc_1, doc_2])

    # total number is initial_document_count + 2
    assert doc_store_with_docs.get_document_count() == initial_document_count + 2

    # remove one of the documents with embedding
    all_embedding_docs = doc_store_with_docs.get_all_documents(namespace="vectors")
    doc_store_with_docs.delete_documents(ids=[all_embedding_docs[0].id])

    # since we deleted one doc, we expect initial_document_count + 1 documents in total
    assert doc_store_with_docs.get_document_count() == initial_document_count + 1

    # and we expect initial_document_count documents without embeddings
    assert doc_store_with_docs.get_document_count(only_documents_without_embedding=True) == initial_document_count

@pytest.mark.integration
def test_get_document_count_after_delete_doc_without_embedding(self, doc_store_with_docs: PineconeDocumentStore):
    """
    Tests that get_document_count() returns the correct number of documents in the document store after a document
    without embedding is deleted from the document store.

    :param doc_store_with_docs: Document store fixture; this test relies on it holding 9 documents, none of which
        has an embedding.
    """
    # there are 9 docs in doc_store_with_docs (all without embeddings)
    initial_document_count = 9

    # therefore we expect initial_document_count documents without embeddings in doc_store_with_docs
    assert doc_store_with_docs.get_document_count(only_documents_without_embedding=True) == initial_document_count
    # and also initial_document_count documents in total
    assert doc_store_with_docs.get_document_count() == initial_document_count

    # two documents without embedding are written to doc_store_with_docs
    # (content now says "without embedding" so the test data matches what is actually written)
    doc_1 = Document(content="Doc without embedding 1", embedding=None)
    doc_2 = Document(content="Doc without embedding 2", embedding=None)
    doc_store_with_docs.write_documents([doc_1, doc_2])

    # total number is initial_document_count + 2
    assert doc_store_with_docs.get_document_count() == initial_document_count + 2

    # remove one of the documents without embedding
    all_non_embedding_docs = doc_store_with_docs.get_all_documents(namespace="no-vectors")
    doc_store_with_docs.delete_documents(ids=[all_non_embedding_docs[0].id])

    # since we deleted one doc, we expect initial_document_count + 1 documents in total
    assert doc_store_with_docs.get_document_count() == initial_document_count + 1

    # and we expect initial_document_count + 1 documents without embeddings as well
    docs_without_embedding = doc_store_with_docs.get_document_count(only_documents_without_embedding=True)
    assert docs_without_embedding == initial_document_count + 1

@pytest.mark.unit
def test_get_all_labels_legacy_document_id(self, ds, monkeypatch):
monkeypatch.setattr(
Expand Down

0 comments on commit 1066e95

Please sign in to comment.