Skip to content

The sparse vector IDF modifier is not correctly updated when points are deleted #6735

@raul3820

Description

@raul3820

Current Behavior

With modifier=models.Modifier.NONE
When more vectors (noise) are added the accuracy trends down, as expected.
When vectors are deleted, the accuracy goes back up.

With modifier=models.Modifier.IDF
When more vectors (noise) are added the accuracy trends down, as expected.
When vectors are deleted, the accuracy drops a lot.

Steps to Reproduce

import asyncio
import uuid
from qdrant_client import AsyncQdrantClient, models
from fastembed import SparseTextEmbedding
import string
import random

coll = "test"
vec = "sparse"
loops = 20

def get_doc() -> str:
    """Return a random pseudo-document of single-character "words".

    The vocabulary is [a-zA-Z0-9]; characters near the middle of that
    alphabet are drawn more often (triangular weights), and the word
    count is sampled from N(20, 5), clamped to at least 1.
    """
    vocab = list(string.ascii_letters + string.digits)
    size = len(vocab)

    # Triangular weighting centered on the middle of the vocabulary.
    center = (size - 1) / 2.0
    word_weights = [size - abs(idx - center) for idx in range(size)]

    # NOTE: random.gauss is called before random.choices on purpose —
    # keeping the RNG call order fixed keeps seeded runs reproducible.
    word_count = max(1, int(random.gauss(20, 5)))

    picked = random.choices(vocab, weights=word_weights, k=word_count)
    return ' '.join(picked)

async def initialize(client: AsyncQdrantClient, mod: models.Modifier):
    """(Re)create the test collection with one sparse vector field.

    Any existing collection named ``coll`` is dropped first so every run
    starts from an empty index; the new collection uses the requested
    sparse-vector modifier (NONE or IDF).
    """
    sparse_config = {
        vec: models.SparseVectorParams(modifier=mod),
    }

    if await client.collection_exists(collection_name=coll):
        print(f"Deleting collection: '{coll}'.")
        await client.delete_collection(collection_name=coll)

    print(f"Creating collection: '{coll}'.")
    await client.create_collection(
        collection_name=coll,
        sparse_vectors_config=sparse_config,
    )


async def upsert(client: AsyncQdrantClient, bm25: SparseTextEmbedding, id: uuid.UUID, text: str):
    """Embed *text* with BM25 and upsert it as a single sparse point.

    ``wait=True`` blocks until the write is applied so subsequent
    searches see the point.
    """
    # fastembed's embed() yields embeddings lazily; take the first (only) one.
    sparse = next(iter(bm25.embed(text))).as_object()

    point = models.PointStruct(
        id=str(id),
        vector={vec: sparse},
    )
    await client.upsert(
        collection_name=coll,
        points=[point],
        wait=True,
    )


async def delete(client: AsyncQdrantClient, ids: list[uuid.UUID]):
    """Remove the given point ids from the collection, waiting for completion."""
    selector = models.PointIdsList(points=list(map(str, ids)))
    await client.delete(
        collection_name=coll,
        points_selector=selector,
        wait=True,
    )


async def search(client: AsyncQdrantClient, bm25: SparseTextEmbedding, query: str):
    """Return the id (as str) of the single nearest point to *query*.

    Returns an empty string when the collection yields no results.
    """
    sparse = next(iter(bm25.embed(query))).as_object()

    response = await client.query_points(
        collection_name=coll,
        query=models.SparseVector(**sparse),
        using=vec,
        limit=1,
    )
    points = response.points
    return str(points[0].id) if points else ""


async def print_info(client: AsyncQdrantClient):
    """Dump server and collection info, framed by separator lines."""
    print("-----")
    # e.g. title='qdrant - vector search engine' version='1.14.1' commit='...'
    print(await client.info())
    print(await client.get_collection(coll))
    print("-----")


async def main(mod: models.Modifier):
    """Run the accuracy experiment for one sparse-vector modifier.

    Each outer round upserts ``loops * loops`` noise documents; after each
    batch of ``loops`` inserts, it queries with words sampled from the last
    inserted document and counts a hit when that document is returned first.
    In the second half of the rounds every accumulated point is deleted at
    the end of the round, which with modifier=IDF drives the reported
    accuracy collapse to 0%.
    """
    client = AsyncQdrantClient(url="http://localhost:6333")
    bm25 = SparseTextEmbedding(model_name="Qdrant/bm25", local_files_only=True)

    await initialize(client, mod)
    await print_info(client)
    print(f"Testing with modifier: {mod}")

    ids = []
    for round_idx in range(loops):
        hits = 0
        for _ in range(loops):
            # Insert a batch of noise documents.
            for _ in range(loops):
                id = uuid.uuid4()
                ids.append(id)
                doc = get_doc()
                await upsert(client, bm25, id, doc)

            # Query with 15 words sampled (with replacement) from the last
            # inserted document; a hit means that document came back first.
            query = " ".join(random.choices(doc.split(), k=15))
            found = await search(client, bm25, query)
            hits += int(str(id) in found)

        # Deletion phase kicks in strictly after the halfway round.
        deleting = round_idx > loops // 2
        action = "(deleting points...)" if deleting else "(adding more points...)"
        rate = f"{100 * hits / loops:6.2f}%"
        print(f"{'success rate:':<14}{rate:<9}{action}")

        if deleting:
            await delete(client, ids)
            ids = []

    await print_info(client)


if __name__ == "__main__":
    # Run the identical experiment with and without the IDF modifier
    # so the two accuracy curves can be compared.
    for modifier in (models.Modifier.NONE, models.Modifier.IDF):
        asyncio.run(main(modifier))

Output

Deleting collection: 'test'.
Creating collection: 'test'.
-----
title='qdrant - vector search engine' version='1.14.1' commit='530430fac2a3ca872504f276d2c91a5c91f43fa0'
status=<CollectionStatus.GREEN: 'green'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=None indexed_vectors_count=0 points_count=0 segments_count=8 config=CollectionConfig(params=CollectionParams(vectors={}, shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors={'sparse': SparseVectorParams(index=None, modifier=<Modifier.NONE: 'none'>)}), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_config=None, strict_mode_config=StrictModeConfigOutput(enabled=False, max_query_limit=None, max_timeout=None, unindexed_filtering_retrieve=None, unindexed_filtering_update=None, search_max_hnsw_ef=None, search_allow_exact=None, search_max_oversampling=None, upsert_max_batchsize=None, max_collection_vector_size_bytes=None, read_rate_limit=None, write_rate_limit=None, max_collection_payload_size_bytes=None, max_points_count=None, filter_max_conditions=None, condition_max_size=None, multivector_config=None, sparse_config=None)) payload_schema={}
-----
Testing with modifier: none
success rate:  75.00%  (adding more points...)
success rate:  55.00%  (adding more points...)
success rate:  50.00%  (adding more points...)
success rate:  35.00%  (adding more points...)
success rate:  40.00%  (adding more points...)
success rate:  55.00%  (adding more points...)
success rate:  45.00%  (adding more points...)
success rate:  45.00%  (adding more points...)
success rate:  30.00%  (adding more points...)
success rate:  50.00%  (adding more points...)
success rate:  30.00%  (adding more points...)
success rate:  15.00%  (deleting points...)
success rate:  85.00%  (deleting points...)
success rate:  80.00%  (deleting points...)
success rate:  80.00%  (deleting points...)
success rate:  85.00%  (deleting points...)
success rate:  90.00%  (deleting points...)
success rate:  75.00%  (deleting points...)
success rate:  90.00%  (deleting points...)
success rate:  90.00%  (deleting points...)
-----
title='qdrant - vector search engine' version='1.14.1' commit='530430fac2a3ca872504f276d2c91a5c91f43fa0'
status=<CollectionStatus.YELLOW: 'yellow'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=None indexed_vectors_count=6080 points_count=0 segments_count=10 config=CollectionConfig(params=CollectionParams(vectors={}, shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors={'sparse': SparseVectorParams(index=None, modifier=<Modifier.NONE: 'none'>)}), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_config=None, strict_mode_config=StrictModeConfigOutput(enabled=False, max_query_limit=None, max_timeout=None, unindexed_filtering_retrieve=None, unindexed_filtering_update=None, search_max_hnsw_ef=None, search_allow_exact=None, search_max_oversampling=None, upsert_max_batchsize=None, max_collection_vector_size_bytes=None, read_rate_limit=None, write_rate_limit=None, max_collection_payload_size_bytes=None, max_points_count=None, filter_max_conditions=None, condition_max_size=None, multivector_config=None, sparse_config=None)) payload_schema={}
-----
Deleting collection: 'test'.
Creating collection: 'test'.
-----
title='qdrant - vector search engine' version='1.14.1' commit='530430fac2a3ca872504f276d2c91a5c91f43fa0'
status=<CollectionStatus.GREEN: 'green'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=None indexed_vectors_count=0 points_count=0 segments_count=8 config=CollectionConfig(params=CollectionParams(vectors={}, shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors={'sparse': SparseVectorParams(index=None, modifier=<Modifier.IDF: 'idf'>)}), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_config=None, strict_mode_config=StrictModeConfigOutput(enabled=False, max_query_limit=None, max_timeout=None, unindexed_filtering_retrieve=None, unindexed_filtering_update=None, search_max_hnsw_ef=None, search_allow_exact=None, search_max_oversampling=None, upsert_max_batchsize=None, max_collection_vector_size_bytes=None, read_rate_limit=None, write_rate_limit=None, max_collection_payload_size_bytes=None, max_points_count=None, filter_max_conditions=None, condition_max_size=None, multivector_config=None, sparse_config=None)) payload_schema={}
-----
Testing with modifier: idf
success rate:  75.00%  (adding more points...)
success rate:  65.00%  (adding more points...)
success rate:  65.00%  (adding more points...)
success rate:  40.00%  (adding more points...)
success rate:  40.00%  (adding more points...)
success rate:  45.00%  (adding more points...)
success rate:  40.00%  (adding more points...)
success rate:  40.00%  (adding more points...)
success rate:  50.00%  (adding more points...)
success rate:  35.00%  (adding more points...)
success rate:  45.00%  (adding more points...)
success rate:  20.00%  (deleting points...)
success rate:   0.00%  (deleting points...)
success rate:   0.00%  (deleting points...)
success rate:   0.00%  (deleting points...)
success rate:   0.00%  (deleting points...)
success rate:   0.00%  (deleting points...)
success rate:   0.00%  (deleting points...)
success rate:   0.00%  (deleting points...)
success rate:   0.00%  (deleting points...)
-----
title='qdrant - vector search engine' version='1.14.1' commit='530430fac2a3ca872504f276d2c91a5c91f43fa0'
status=<CollectionStatus.YELLOW: 'yellow'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=None indexed_vectors_count=4105 points_count=0 segments_count=10 config=CollectionConfig(params=CollectionParams(vectors={}, shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors={'sparse': SparseVectorParams(index=None, modifier=<Modifier.IDF: 'idf'>)}), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_config=None, strict_mode_config=StrictModeConfigOutput(enabled=False, max_query_limit=None, max_timeout=None, unindexed_filtering_retrieve=None, unindexed_filtering_update=None, search_max_hnsw_ef=None, search_allow_exact=None, search_max_oversampling=None, upsert_max_batchsize=None, max_collection_vector_size_bytes=None, read_rate_limit=None, write_rate_limit=None, max_collection_payload_size_bytes=None, max_points_count=None, filter_max_conditions=None, condition_max_size=None, multivector_config=None, sparse_config=None)) payload_schema={}
-----

Expected Behavior

IDF accuracy is slightly better than non-IDF accuracy. When more points (noise) are added, accuracy is of course expected to trend down. But when points (noise) are deleted, accuracy should go back up; it should not collapse to 0%.

Possible Solution

Context (Environment)

Any setup with dynamic content, i.e. where points are continuously added and deleted.

Detailed Description

Possible Implementation

Metadata

Metadata

Assignees

Labels

bug — Something isn't working

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions