Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MB-62230 - Pre-Filtering Support for kNN #2063

Merged
merged 18 commits into from
Sep 10, 2024
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions docs/vectors.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,26 @@ if err != nil {
fmt.Println(searchResult.Hits)
```

## Querying with Filters (v2.4.3+)
```go
searchRequest := NewSearchRequest(query.NewMatchNoneQuery())

filterQuery := NewTermQuery("hello")

searchRequest.AddKNNWithFilter(
"vec", // vector field name
[]float32{10,11,12,13,14,15,16,17,18,19}, // query vector (same dims)
5, // k
0, // boost
filterQuery, // filter query
)
searchResult, err := index.Search(searchRequest)
if err != nil {
panic(err)
}
fmt.Println(searchResult.Hits)
```

## Setup Instructions

* Using `cmake` is a recommended approach by FAISS authors.
Expand Down
8 changes: 4 additions & 4 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@ go 1.21
require (
github.com/RoaringBitmap/roaring v1.9.3
github.com/bits-and-blooms/bitset v1.12.0
github.com/blevesearch/bleve_index_api v1.1.11
github.com/blevesearch/bleve_index_api v1.1.12
github.com/blevesearch/geo v0.1.20
github.com/blevesearch/go-metrics v0.0.0-20201227073835-cf1acfcdf475
github.com/blevesearch/go-porterstemmer v1.0.3
github.com/blevesearch/goleveldb v1.0.1
github.com/blevesearch/gtreap v0.1.1
github.com/blevesearch/scorch_segment_api/v2 v2.2.15
github.com/blevesearch/scorch_segment_api/v2 v2.2.16
github.com/blevesearch/segment v0.9.1
github.com/blevesearch/snowball v0.6.1
github.com/blevesearch/snowballstem v0.9.0
Expand All @@ -23,7 +23,7 @@ require (
github.com/blevesearch/zapx/v13 v13.3.10
github.com/blevesearch/zapx/v14 v14.3.10
github.com/blevesearch/zapx/v15 v15.3.13
github.com/blevesearch/zapx/v16 v16.1.6-0.20240904144721-dbcb3c000a29
github.com/blevesearch/zapx/v16 v16.1.6-0.20240909182401-e148470cefbe
github.com/couchbase/moss v0.2.0
github.com/golang/protobuf v1.3.2
github.com/spf13/cobra v1.7.0
Expand All @@ -32,7 +32,7 @@ require (
)

require (
github.com/blevesearch/go-faiss v1.0.21 // indirect
github.com/blevesearch/go-faiss v1.0.22-0.20240909180832-35a1ff78ead4 // indirect
github.com/blevesearch/mmap-go v1.0.4 // indirect
github.com/couchbase/ghistogram v0.1.0 // indirect
github.com/golang/geo v0.0.0-20210211234256-740aa86cb551 // indirect
Expand Down
16 changes: 8 additions & 8 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@ github.com/RoaringBitmap/roaring v1.9.3 h1:t4EbC5qQwnisr5PrP9nt0IRhRTb9gMUgQF4t4
github.com/RoaringBitmap/roaring v1.9.3/go.mod h1:6AXUsoIEzDTFFQCe1RbGA6uFONMhvejWj5rqITANK90=
github.com/bits-and-blooms/bitset v1.12.0 h1:U/q1fAF7xXRhFCrhROzIfffYnu+dlS38vCZtmFVPHmA=
github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
github.com/blevesearch/bleve_index_api v1.1.11 h1:OTNpRnxPWFIhMSgBUBlkD7RVWYrfsojtQeACb8tGGpw=
github.com/blevesearch/bleve_index_api v1.1.11/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8=
github.com/blevesearch/bleve_index_api v1.1.12 h1:P4bw9/G/5rulOF7SJ9l4FsDoo7UFJ+5kexNy1RXfegY=
github.com/blevesearch/bleve_index_api v1.1.12/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8=
github.com/blevesearch/geo v0.1.20 h1:paaSpu2Ewh/tn5DKn/FB5SzvH0EWupxHEIwbCk/QPqM=
github.com/blevesearch/geo v0.1.20/go.mod h1:DVG2QjwHNMFmjo+ZgzrIq2sfCh6rIHzy9d9d0B59I6w=
github.com/blevesearch/go-faiss v1.0.21 h1:0PdlpvqTC9uP67TBNBVSw+aLoFqo8oulghQ3R9NZ4Pk=
github.com/blevesearch/go-faiss v1.0.21/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk=
github.com/blevesearch/go-faiss v1.0.22-0.20240909180832-35a1ff78ead4 h1:riy8XP3UIBeVjMhsq1r1aGfjvTf3aPp2PuXxdiw9P4s=
github.com/blevesearch/go-faiss v1.0.22-0.20240909180832-35a1ff78ead4/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk=
github.com/blevesearch/go-metrics v0.0.0-20201227073835-cf1acfcdf475 h1:kDy+zgJFJJoJYBvdfBSiZYBbdsUL0XcjHYWezpQBGPA=
github.com/blevesearch/go-metrics v0.0.0-20201227073835-cf1acfcdf475/go.mod h1:9eJDeqxJ3E7WnLebQUlPD7ZjSce7AnDb9vjGmMCbD0A=
github.com/blevesearch/go-porterstemmer v1.0.3 h1:GtmsqID0aZdCSNiY8SkuPJ12pD4jI+DdXTAn4YRcHCo=
Expand All @@ -19,8 +19,8 @@ github.com/blevesearch/gtreap v0.1.1/go.mod h1:QaQyDRAT51sotthUWAH4Sj08awFSSWzgY
github.com/blevesearch/mmap-go v1.0.2/go.mod h1:ol2qBqYaOUsGdm7aRMRrYGgPvnwLe6Y+7LMvAB5IbSA=
github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc=
github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs=
github.com/blevesearch/scorch_segment_api/v2 v2.2.15 h1:prV17iU/o+A8FiZi9MXmqbagd8I0bCqM7OKUYPbnb5Y=
github.com/blevesearch/scorch_segment_api/v2 v2.2.15/go.mod h1:db0cmP03bPNadXrCDuVkKLV6ywFSiRgPFT1YVrestBc=
github.com/blevesearch/scorch_segment_api/v2 v2.2.16 h1:uGvKVvG7zvSxCwcm4/ehBa9cCEuZVE+/zvrSl57QUVY=
github.com/blevesearch/scorch_segment_api/v2 v2.2.16/go.mod h1:VF5oHVbIFTu+znY1v30GjSpT5+9YFs9dV2hjvuh34F0=
github.com/blevesearch/segment v0.9.1 h1:+dThDy+Lvgj5JMxhmOVlgFfkUtZV2kw49xax4+jTfSU=
github.com/blevesearch/segment v0.9.1/go.mod h1:zN21iLm7+GnBHWTao9I+Au/7MBiL8pPFtJBJTsk6kQw=
github.com/blevesearch/snowball v0.6.1 h1:cDYjn/NCH+wwt2UdehaLpr2e4BwLIjN4V/TdLsL+B5A=
Expand All @@ -43,8 +43,8 @@ github.com/blevesearch/zapx/v14 v14.3.10 h1:SG6xlsL+W6YjhX5N3aEiL/2tcWh3DO75Bnz7
github.com/blevesearch/zapx/v14 v14.3.10/go.mod h1:qqyuR0u230jN1yMmE4FIAuCxmahRQEOehF78m6oTgns=
github.com/blevesearch/zapx/v15 v15.3.13 h1:6EkfaZiPlAxqXz0neniq35my6S48QI94W/wyhnpDHHQ=
github.com/blevesearch/zapx/v15 v15.3.13/go.mod h1:Turk/TNRKj9es7ZpKK95PS7f6D44Y7fAFy8F4LXQtGg=
github.com/blevesearch/zapx/v16 v16.1.6-0.20240904144721-dbcb3c000a29 h1:SpzoLVHa6DlbtS1o5WZu1qBzE15nQHKOViZkJHd0+XQ=
github.com/blevesearch/zapx/v16 v16.1.6-0.20240904144721-dbcb3c000a29/go.mod h1:9WSiNE0zM1mu9cuJqq8peTXxccVHGIcIzeG/QKX13fc=
github.com/blevesearch/zapx/v16 v16.1.6-0.20240909182401-e148470cefbe h1:S1rCvhrU2HqDrRtogYgM52rT5px7o2zFIB3Yo+JPFOU=
github.com/blevesearch/zapx/v16 v16.1.6-0.20240909182401-e148470cefbe/go.mod h1:x9Kg015zbkSXxmE7F+0qeGxpeHJBwkDuxosrrDxYltU=
github.com/couchbase/ghistogram v0.1.0 h1:b95QcQTCzjTUocDXp/uMgSNQi8oj1tGwnJ4bODWZnps=
github.com/couchbase/ghistogram v0.1.0/go.mod h1:s1Jhy76zqfEecpNWJfWUiKZookAFaiGOEoyzgHt9i7k=
github.com/couchbase/moss v0.2.0 h1:VCYrMzFwEryyhRSeI+/b3tRBSeTpi/8gn5Kf6dxqn+o=
Expand Down
37 changes: 35 additions & 2 deletions index/scorch/optimize_knn.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ type OptimizeVR struct {
totalCost uint64
// maps field to vector readers
vrs map[string][]*IndexSnapshotVectorReader
// if at least one of the vector readers requires filtered kNN.
requiresFiltering bool
}

// This setting _MUST_ only be changed during init and not after.
Expand Down Expand Up @@ -62,6 +64,8 @@ func (o *OptimizeVR) Finish() error {
var errorsM sync.Mutex
var errors []error

snapshotGlobalDocNums := o.snapshot.globalDocNums()

defer o.invokeSearcherEndCallback()

wg := sync.WaitGroup{}
Expand All @@ -77,7 +81,8 @@ func (o *OptimizeVR) Finish() error {
wg.Done()
}()
for field, vrs := range o.vrs {
vecIndex, err := segment.InterpretVectorIndex(field, origSeg.deleted)
vecIndex, err := segment.InterpretVectorIndex(field,
o.requiresFiltering, origSeg.deleted)
if err != nil {
errorsM.Lock()
errors = append(errors, err)
Expand All @@ -89,9 +94,34 @@ func (o *OptimizeVR) Finish() error {
vectorIndexSize := vecIndex.Size()
origSeg.cachedMeta.updateMeta(field, vectorIndexSize)
for _, vr := range vrs {
eligibleVectorInternalIDs := vr.getEligibleDocIDs()
if snapshotGlobalDocNums != nil {
// Keep only the eligible documents that belong to this segment.
// The eligible IDs are global doc numbers and carry no segment
// information, so intersect them with this segment's global doc numbers.
eligibleVectorInternalIDs.And(snapshotGlobalDocNums[index])
}

eligibleLocalDocNums := make([]uint64,
eligibleVectorInternalIDs.Stats().Cardinality)
// get the (segment-)local document numbers
for i, docNum := range eligibleVectorInternalIDs.ToArray() {
localDocNum := o.snapshot.localDocNumFromGlobal(index,
uint64(docNum))
eligibleLocalDocNums[i] = localDocNum
}

var pl segment_api.VecPostingsList
var err error
// for each VR, populate postings list and iterators
// by passing the obtained vector index and getting similar vectors.
pl, err := vecIndex.Search(vr.vector, vr.k, vr.searchParams)
if vr.eligibleDocIDs != nil && len(vr.eligibleDocIDs) > 0 {
pl, err = vecIndex.SearchWithFilter(vr.vector, vr.k,
eligibleLocalDocNums, vr.searchParams)
} else {
pl, err = vecIndex.Search(vr.vector, vr.k, vr.searchParams)
}

if err != nil {
errorsM.Lock()
errors = append(errors, err)
Expand Down Expand Up @@ -140,6 +170,9 @@ func (s *IndexSnapshotVectorReader) VectorOptimize(ctx context.Context,
return octx, nil
}
o.ctx = ctx
if !o.requiresFiltering {
o.requiresFiltering = len(s.eligibleDocIDs) > 0
}

if o.snapshot != s.snapshot {
o.invokeSearcherEndCallback()
Expand Down
30 changes: 29 additions & 1 deletion index/scorch/snapshot_index.go
Original file line number Diff line number Diff line change
Expand Up @@ -471,16 +471,44 @@ func (is *IndexSnapshot) Document(id string) (rv index.Document, err error) {
return rvd, nil
}

// segmentIndexAndLocalDocNumFromGlobal resolves a global docnum to the segment
// that holds it.
//
// In a multi-segment index, each document has:
//  1. a local docnum - local to the segment
//  2. a global docnum - unique identifier across the index
//
// It returns the segment index (the segment in which the docnum is present)
// and the local docnum of the document within that segment.
func (is *IndexSnapshot) segmentIndexAndLocalDocNumFromGlobal(docNum uint64) (int, uint64) {
	// Find the first offset strictly greater than docNum; the owning segment
	// is the one just before it. This assumes is.offsets is sorted ascending,
	// as sort.Search requires.
	segmentIndex := sort.Search(len(is.offsets),
		func(x int) bool {
			return is.offsets[x] > docNum
		}) - 1

	// sort.Search already yields an int, so no conversion is needed.
	return segmentIndex, is.localDocNumFromGlobal(segmentIndex, docNum)
}

// localDocNumFromGlobal converts a global docnum into the docnum local to the
// segment at segmentIndex, by subtracting that segment's entry in is.offsets.
func (is *IndexSnapshot) localDocNumFromGlobal(segmentIndex int, docNum uint64) uint64 {
	segmentOffset := is.offsets[segmentIndex]
	return docNum - segmentOffset
}

// Function to return a mapping of the segment index to the live global doc nums
// in the segment of the specified index snapshot.
func (is *IndexSnapshot) globalDocNums() map[int]*roaring.Bitmap {
if len(is.segment) == 0 {
return nil
}

segmentIndexGlobalDocNums := make(map[int]*roaring.Bitmap)
metonymic-smokey marked this conversation as resolved.
Show resolved Hide resolved

for i := range is.segment {
segmentIndexGlobalDocNums[i] = roaring.NewBitmap()
for _, localDocNum := range is.segment[i].DocNumbersLive().ToArray() {
segmentIndexGlobalDocNums[i].Add(localDocNum + uint32(is.offsets[i]))
}
}
return segmentIndexGlobalDocNums
}

func (is *IndexSnapshot) ExternalID(id index.IndexInternalID) (string, error) {
docNum, err := docInternalToNumber(id)
if err != nil {
Expand Down
31 changes: 30 additions & 1 deletion index/scorch/snapshot_index_vr.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"fmt"
"reflect"

"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/v2/size"
index "github.com/blevesearch/bleve_index_api"
segment_api "github.com/blevesearch/scorch_segment_api/v2"
Expand Down Expand Up @@ -51,6 +52,24 @@ type IndexSnapshotVectorReader struct {
ctx context.Context

searchParams json.RawMessage

// The following fields are only applicable for vector readers which will
// process pre-filtered kNN queries.
eligibleDocIDs []index.IndexInternalID
}

// Function to convert the internal IDs of the eligible documents to a type suitable
// for addition to a bitmap.
// Useful to have the eligible doc IDs in a bitmap to leverage the fast intersection
// (AND) operations. Eg. finding the eligible doc IDs present in a segment.
func (i *IndexSnapshotVectorReader) getEligibleDocIDs() *roaring.Bitmap {
res := roaring.NewBitmap()
abhinavdangeti marked this conversation as resolved.
Show resolved Hide resolved
// converts the doc IDs to uint32 and returns
for _, eligibleDocInternalID := range i.eligibleDocIDs {
internalDocID, _ := docInternalToNumber(index.IndexInternalID(eligibleDocInternalID))
res.Add(uint32(internalDocID))
}
return res
}

func (i *IndexSnapshotVectorReader) Size() int {
Expand Down Expand Up @@ -108,7 +127,17 @@ func (i *IndexSnapshotVectorReader) Advance(ID index.IndexInternalID,
preAlloced *index.VectorDoc) (*index.VectorDoc, error) {

if i.currPosting != nil && bytes.Compare(i.currID, ID) >= 0 {
i2, err := i.snapshot.VectorReader(i.ctx, i.vector, i.field, i.k, i.searchParams)
var i2 index.VectorReader
var err error

if len(i.eligibleDocIDs) > 0 {
i2, err = i.snapshot.VectorReaderWithFilter(i.ctx, i.vector, i.field,
i.k, i.searchParams, i.eligibleDocIDs)
} else {
i2, err = i.snapshot.VectorReader(i.ctx, i.vector, i.field, i.k,
i.searchParams)
}

if err != nil {
return nil, err
}
Expand Down
26 changes: 26 additions & 0 deletions index/scorch/snapshot_vector_index.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,29 @@ func (is *IndexSnapshot) VectorReader(ctx context.Context, vector []float32,

return rv, nil
}

// VectorReaderWithFilter creates a vector reader over this snapshot for the
// given query vector, field, k and search params, and records filterIDs as the
// reader's eligible doc IDs.
// The returned error is always nil.
func (is *IndexSnapshot) VectorReaderWithFilter(ctx context.Context, vector []float32,
	field string, k int64, searchParams json.RawMessage,
	filterIDs []index.IndexInternalID) (
	index.VectorReader, error) {

	segmentCount := len(is.segment)

	// Only the per-segment slots are allocated here; the postings and
	// iterators themselves are initialized within the OptimizeVR's Finish().
	reader := &IndexSnapshotVectorReader{
		snapshot:       is,
		field:          field,
		vector:         vector,
		k:              k,
		searchParams:   searchParams,
		eligibleDocIDs: filterIDs,
		postings:       make([]segment_api.VecPostingsList, segmentCount),
		iterators:      make([]segment_api.VecPostingsIterator, segmentCount),
	}

	return reader, nil
}
Loading
Loading