Skip to content

The sparse vector IDF modifier is not correctly updated when points are deleted #6735

@raul3820

Description

@raul3820

Current Behavior

With modifier=models.Modifier.NONE
When more vectors (noise) are added the accuracy trends down, as expected.
When vectors are deleted, the accuracy goes back up.

With modifier=models.Modifier.IDF
When more vectors (noise) are added the accuracy trends down, as expected.
When vectors are deleted, the accuracy drops a lot.

Steps to Reproduce

import asyncio
import uuid
from qdrant_client import AsyncQdrantClient, models
from fastembed import SparseTextEmbedding
import string
import random

coll = "test"
vec = "sparse"
loops = 20

def get_doc() -> str:
    """Return a random pseudo-document of single-character "words".

    The vocabulary is [a-zA-Z0-9]; characters near the middle of that
    alphabet are drawn more often (triangular weights), and the word
    count is sampled from N(20, 5), clamped to at least 1.
    """
    vocab = list(string.ascii_letters + string.digits)
    size = len(vocab)

    # Triangular weighting centered on the middle of the vocabulary.
    center = (size - 1) / 2.0
    word_weights = [size - abs(idx - center) for idx in range(size)]

    # NOTE: random.gauss is called before random.choices on purpose —
    # keeping the RNG call order fixed keeps seeded runs reproducible.
    word_count = max(1, int(random.gauss(20, 5)))

    picked = random.choices(vocab, weights=word_weights, k=word_count)
    return ' '.join(picked)

async def initialize(client: AsyncQdrantClient, mod: models.Modifier):
    """(Re)create the test collection with one sparse vector field.

    Any existing collection named ``coll`` is dropped first so every run
    starts from an empty index; the new collection uses the requested
    sparse-vector modifier (NONE or IDF).
    """
    sparse_config = {
        vec: models.SparseVectorParams(modifier=mod),
    }

    if await client.collection_exists(collection_name=coll):
        print(f"Deleting collection: '{coll}'.")
        await client.delete_collection(collection_name=coll)

    print(f"Creating collection: '{coll}'.")
    await client.create_collection(
        collection_name=coll,
        sparse_vectors_config=sparse_config,
    )


async def upsert(client: AsyncQdrantClient, bm25: SparseTextEmbedding, id: uuid.UUID, text: str):
    """Embed *text* with BM25 and upsert it as a single sparse point.

    ``wait=True`` blocks until the write is applied so subsequent
    searches see the point.
    """
    # fastembed's embed() yields embeddings lazily; take the first (only) one.
    sparse = next(iter(bm25.embed(text))).as_object()

    point = models.PointStruct(
        id=str(id),
        vector={vec: sparse},
    )
    await client.upsert(
        collection_name=coll,
        points=[point],
        wait=True,
    )


async def delete(client: AsyncQdrantClient, ids: list[uuid.UUID]):
    """Remove the given point ids from the collection, waiting for completion."""
    selector = models.PointIdsList(points=list(map(str, ids)))
    await client.delete(
        collection_name=coll,
        points_selector=selector,
        wait=True,
    )


async def search(client: AsyncQdrantClient, bm25: SparseTextEmbedding, query: str):
    """Return the id (as str) of the single nearest point to *query*.

    Returns an empty string when the collection yields no results.
    """
    sparse = next(iter(bm25.embed(query))).as_object()

    response = await client.query_points(
        collection_name=coll,
        query=models.SparseVector(**sparse),
        using=vec,
        limit=1,
    )
    points = response.points
    return str(points[0].id) if points else ""


async def print_info(client: AsyncQdrantClient):
    """Dump server and collection info, framed by separator lines."""
    print("-----")
    # e.g. title='qdrant - vector search engine' version='1.14.1' commit='...'
    print(await client.info())
    print(await client.get_collection(coll))
    print("-----")


async def main(mod: models.Modifier):
    """Run the accuracy experiment for one sparse-vector modifier.

    Each outer round upserts ``loops * loops`` noise documents; after each
    batch of ``loops`` inserts, it queries with words sampled from the last
    inserted document and counts a hit when that document is returned first.
    In the second half of the rounds every accumulated point is deleted at
    the end of the round, which with modifier=IDF drives the reported
    accuracy collapse to 0%.
    """
    client = AsyncQdrantClient(url="http://localhost:6333")
    bm25 = SparseTextEmbedding(model_name="Qdrant/bm25", local_files_only=True)

    await initialize(client, mod)
    await print_info(client)
    print(f"Testing with modifier: {mod}")

    ids = []
    for round_idx in range(loops):
        hits = 0
        for _ in range(loops):
            # Insert a batch of noise documents.
            for _ in range(loops):
                id = uuid.uuid4()
                ids.append(id)
                doc = get_doc()
                await upsert(client, bm25, id, doc)

            # Query with 15 words sampled (with replacement) from the last
            # inserted document; a hit means that document came back first.
            query = " ".join(random.choices(doc.split(), k=15))
            found = await search(client, bm25, query)
            hits += int(str(id) in found)

        # Deletion phase kicks in strictly after the halfway round.
        deleting = round_idx > loops // 2
        action = "(deleting points...)" if deleting else "(adding more points...)"
        rate = f"{100 * hits / loops:6.2f}%"
        print(f"{'success rate:':<14}{rate:<9}{action}")

        if deleting:
            await delete(client, ids)
            ids = []

    await print_info(client)


if __name__ == "__main__":
    # Run the identical experiment with and without the IDF modifier
    # so the two accuracy curves can be compared.
    for modifier in (models.Modifier.NONE, models.Modifier.IDF):
        asyncio.run(main(modifier))

Output

Deleting collection: 'test'.
Creating collection: 'test'.
-----
title='qdrant - vector search engine' version='1.14.1' commit='530430fac2a3ca872504f276d2c91a5c91f43fa0'
status=<CollectionStatus.GREEN: 'green'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=None indexed_vectors_count=0 points_count=0 segments_count=8 config=CollectionConfig(params=CollectionParams(vectors={}, shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors={'sparse': SparseVectorParams(index=None, modifier=<Modifier.NONE: 'none'>)}), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_config=None, strict_mode_config=StrictModeConfigOutput(enabled=False, max_query_limit=None, max_timeout=None, unindexed_filtering_retrieve=None, unindexed_filtering_update=None, search_max_hnsw_ef=None, search_allow_exact=None, search_max_oversampling=None, upsert_max_batchsize=None, max_collection_vector_size_bytes=None, read_rate_limit=None, write_rate_limit=None, max_collection_payload_size_bytes=None, max_points_count=None, filter_max_conditions=None, condition_max_size=None, multivector_config=None, sparse_config=None)) payload_schema={}
-----
Testing with modifier: none
success rate:  75.00%  (adding more points...)
success rate:  55.00%  (adding more points...)
success rate:  50.00%  (adding more points...)
success rate:  35.00%  (adding more points...)
success rate:  40.00%  (adding more points...)
success rate:  55.00%  (adding more points...)
success rate:  45.00%  (adding more points...)
success rate:  45.00%  (adding more points...)
success rate:  30.00%  (adding more points...)
success rate:  50.00%  (adding more points...)
success rate:  30.00%  (adding more points...)
success rate:  15.00%  (deleting points...)
success rate:  85.00%  (deleting points...)
success rate:  80.00%  (deleting points...)
success rate:  80.00%  (deleting points...)
success rate:  85.00%  (deleting points...)
success rate:  90.00%  (deleting points...)
success rate:  75.00%  (deleting points...)
success rate:  90.00%  (deleting points...)
success rate:  90.00%  (deleting points...)
-----
title='qdrant - vector search engine' version='1.14.1' commit='530430fac2a3ca872504f276d2c91a5c91f43fa0'
status=<CollectionStatus.YELLOW: 'yellow'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=None indexed_vectors_count=6080 points_count=0 segments_count=10 config=CollectionConfig(params=CollectionParams(vectors={}, shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors={'sparse': SparseVectorParams(index=None, modifier=<Modifier.NONE: 'none'>)}), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_config=None, strict_mode_config=StrictModeConfigOutput(enabled=False, max_query_limit=None, max_timeout=None, unindexed_filtering_retrieve=None, unindexed_filtering_update=None, search_max_hnsw_ef=None, search_allow_exact=None, search_max_oversampling=None, upsert_max_batchsize=None, max_collection_vector_size_bytes=None, read_rate_limit=None, write_rate_limit=None, max_collection_payload_size_bytes=None, max_points_count=None, filter_max_conditions=None, condition_max_size=None, multivector_config=None, sparse_config=None)) payload_schema={}
-----
Deleting collection: 'test'.
Creating collection: 'test'.
-----
title='qdrant - vector search engine' version='1.14.1' commit='530430fac2a3ca872504f276d2c91a5c91f43fa0'
status=<CollectionStatus.GREEN: 'green'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=None indexed_vectors_count=0 points_count=0 segments_count=8 config=CollectionConfig(params=CollectionParams(vectors={}, shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors={'sparse': SparseVectorParams(index=None, modifier=<Modifier.IDF: 'idf'>)}), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_config=None, strict_mode_config=StrictModeConfigOutput(enabled=False, max_query_limit=None, max_timeout=None, unindexed_filtering_retrieve=None, unindexed_filtering_update=None, search_max_hnsw_ef=None, search_allow_exact=None, search_max_oversampling=None, upsert_max_batchsize=None, max_collection_vector_size_bytes=None, read_rate_limit=None, write_rate_limit=None, max_collection_payload_size_bytes=None, max_points_count=None, filter_max_conditions=None, condition_max_size=None, multivector_config=None, sparse_config=None)) payload_schema={}
-----
Testing with modifier: idf
success rate:  75.00%  (adding more points...)
success rate:  65.00%  (adding more points...)
success rate:  65.00%  (adding more points...)
success rate:  40.00%  (adding more points...)
success rate:  40.00%  (adding more points...)
success rate:  45.00%  (adding more points...)
success rate:  40.00%  (adding more points...)
success rate:  40.00%  (adding more points...)
success rate:  50.00%  (adding more points...)
success rate:  35.00%  (adding more points...)
success rate:  45.00%  (adding more points...)
success rate:  20.00%  (deleting points...)
success rate:   0.00%  (deleting points...)
success rate:   0.00%  (deleting points...)
success rate:   0.00%  (deleting points...)
success rate:   0.00%  (deleting points...)
success rate:   0.00%  (deleting points...)
success rate:   0.00%  (deleting points...)
success rate:   0.00%  (deleting points...)
success rate:   0.00%  (deleting points...)
-----
title='qdrant - vector search engine' version='1.14.1' commit='530430fac2a3ca872504f276d2c91a5c91f43fa0'
status=<CollectionStatus.YELLOW: 'yellow'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=None indexed_vectors_count=4105 points_count=0 segments_count=10 config=CollectionConfig(params=CollectionParams(vectors={}, shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors={'sparse': SparseVectorParams(index=None, modifier=<Modifier.IDF: 'idf'>)}), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_config=None, strict_mode_config=StrictModeConfigOutput(enabled=False, max_query_limit=None, max_timeout=None, unindexed_filtering_retrieve=None, unindexed_filtering_update=None, search_max_hnsw_ef=None, search_allow_exact=None, search_max_oversampling=None, upsert_max_batchsize=None, max_collection_vector_size_bytes=None, read_rate_limit=None, write_rate_limit=None, max_collection_payload_size_bytes=None, max_points_count=None, filter_max_conditions=None, condition_max_size=None, multivector_config=None, sparse_config=None)) payload_schema={}
-----

Expected Behavior

IDF accuracy is slightly better than non-IDF accuracy. When more points (noise) are added, accuracy is of course expected to trend down. But when points (noise) are deleted, accuracy should go back up; it should not collapse to 0%.

Possible Solution

Context (Environment)

Any setup with dynamic content, i.e. where points are continuously added and deleted.

Detailed Description

Possible Implementation

Metadata

Metadata

Assignees

Labels

bug — Something isn't working

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions