-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Open
Labels
bug — Something isn't working
Description
Current Behavior
With modifier=models.Modifier.NONE
When more vectors (noise) are added the accuracy trends down, as expected.
When vectors are deleted, the accuracy goes back up.
With modifier=models.Modifier.IDF
When more vectors (noise) are added the accuracy trends down, as expected.
When vectors are deleted, the accuracy drops a lot.
Steps to Reproduce
import asyncio
import uuid
from qdrant_client import AsyncQdrantClient, models
from fastembed import SparseTextEmbedding
import string
import random
# Collection name used throughout the repro.
coll = "test"
# Name of the sparse vector field in the collection.
vec = "sparse"
# Single knob for every loop dimension: outer rounds, searches per
# round, and noise documents inserted per search (so each round adds
# loops * loops points).
loops = 20
def get_doc() -> str:
    """Generate a random pseudo-document of single-character "words".

    The vocabulary is every ASCII letter and digit. Characters near the
    middle of the vocabulary get higher weights (a triangular
    distribution), and the word count is drawn from a Gaussian with
    mean 20 and stddev 5, clamped to at least 1.

    Returns:
        A space-joined string of randomly chosen single characters.
    """
    # list(...) instead of a pass-through comprehension (ruff C416).
    words = list(string.ascii_letters + string.digits)
    n = len(words)
    # Triangular weighting: maximal at the middle of the vocabulary,
    # falling off linearly toward both ends.
    mid_point = (n - 1) / 2.0
    weights = [n - abs(i - mid_point) for i in range(n)]
    # Document length ~ N(20, 5), never below one word.
    k = max(1, int(random.gauss(20, 5)))
    chosen = random.choices(words, weights=weights, k=k)
    return ' '.join(chosen)
async def initialize(client: AsyncQdrantClient, mod: models.Modifier) -> None:
    """Drop the test collection if it exists, then recreate it with the
    given sparse-vector modifier so every run starts from a clean state."""
    if await client.collection_exists(collection_name=coll):
        print(f"Deleting collection: '{coll}'.")
        await client.delete_collection(
            collection_name=coll,
        )
    print(f"Creating collection: '{coll}'.")
    sparse_config = {
        vec: models.SparseVectorParams(
            modifier=mod,
        ),
    }
    await client.create_collection(
        collection_name=coll,
        sparse_vectors_config=sparse_config,
    )
async def upsert(client: AsyncQdrantClient, bm25: SparseTextEmbedding, id: uuid.UUID, text: str) -> None:
    """Embed *text* with BM25 and upsert it as a single point.

    Args:
        client: Connected async Qdrant client.
        bm25: Sparse embedding model used to encode the document.
        id: Point id (stored in its string form). NOTE(review): the
            name shadows the builtin ``id``; kept for caller compatibility.
        text: Document text to embed.
    """
    # next(iter(...)) takes just the first embedding instead of
    # materializing the whole (single-element) iterable into a list.
    emb = next(iter(bm25.embed(text))).as_object()
    await client.upsert(
        collection_name=coll,
        points=[
            models.PointStruct(
                id=str(id),
                vector={vec: emb},
            )
        ],
        wait=True,
    )
async def delete(client: AsyncQdrantClient, ids: list[uuid.UUID]) -> None:
    """Remove the points with the given ids, waiting for completion."""
    selector = models.PointIdsList(
        points=[str(point_id) for point_id in ids],
    )
    await client.delete(
        collection_name=coll,
        points_selector=selector,
        wait=True,
    )
async def search(client: AsyncQdrantClient, bm25: SparseTextEmbedding, query: str) -> str:
    """Embed *query* and return the id of the single nearest point.

    Args:
        client: Connected async Qdrant client.
        bm25: Sparse embedding model used to encode the query.
        query: Query text.

    Returns:
        The top hit's id as a string, or "" when the collection is empty.
    """
    # next(iter(...)) takes just the first embedding instead of
    # materializing the whole (single-element) iterable into a list.
    emb = next(iter(bm25.embed(query))).as_object()
    nearest = await client.query_points(
        collection_name=coll,
        query=models.SparseVector(**emb),
        using=vec,
        limit=1,
    )
    if not nearest.points:
        return ""
    return str(nearest.points[0].id)
async def print_info(client: AsyncQdrantClient) -> None:
    """Print server info and collection info, framed by separator lines.

    Server info looks like:
    title='qdrant - vector search engine' version='1.14.1' commit='...'
    """
    print("-----")
    server_info = await client.info()
    print(server_info)
    collection_info = await client.get_collection(coll)
    print(collection_info)
    print("-----")
async def main(mod: models.Modifier) -> None:
    """Run the accuracy experiment for one sparse-vector modifier.

    Each of the ``loops`` outer rounds performs ``loops`` trials; every
    trial inserts ``loops`` noise documents and then checks whether the
    most recently inserted document is the top search hit for a random
    15-word sample of itself. Once more than half of the rounds have
    run, all accumulated points are deleted at the end of each round so
    the post-deletion accuracy trend can be observed.
    """
    client = AsyncQdrantClient(url="http://localhost:6333")
    bm25 = SparseTextEmbedding(model_name="Qdrant/bm25", local_files_only=True)
    await initialize(client, mod)
    await print_info(client)
    print(f"Testing with modifier: {mod}")
    ids: list[uuid.UUID] = []
    for round_no in range(loops):
        sum_success = 0
        for _ in range(loops):
            # Noise: insert a batch of random documents, remembering the
            # last point id and document for the search test below.
            for _ in range(loops):
                point_id = uuid.uuid4()  # renamed from `id` (builtin shadow)
                ids.append(point_id)
                doc = get_doc()
                await upsert(client, bm25, point_id, doc)
            # Search test: query with a random 15-word sample of the last
            # document; success when that document's id comes back first.
            query = " ".join(random.choices(doc.split(), k=15))
            r = await search(client, bm25, query)
            sum_success += int(str(point_id) in r)
        # After more than half of the rounds, start deleting everything
        # inserted so far at the end of every round.
        is_delete = round_no > loops // 2
        points_action = "(deleting points...)" if is_delete else "(adding more points...)"
        percentage_str = f"{100 * sum_success / loops:6.2f}%"
        print(f"{'success rate:':<14}{percentage_str:<9}{points_action}")
        if is_delete:
            await delete(client, ids)
            ids = []
    await print_info(client)
if __name__ == "__main__":
    # Reproduce the issue with both modifiers: the NONE baseline first,
    # then IDF, which exhibits the post-deletion accuracy collapse.
    asyncio.run(main(models.Modifier.NONE))
    asyncio.run(main(models.Modifier.IDF))
Output
Deleting collection: 'test'.
Creating collection: 'test'.
-----
title='qdrant - vector search engine' version='1.14.1' commit='530430fac2a3ca872504f276d2c91a5c91f43fa0'
status=<CollectionStatus.GREEN: 'green'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=None indexed_vectors_count=0 points_count=0 segments_count=8 config=CollectionConfig(params=CollectionParams(vectors={}, shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors={'sparse': SparseVectorParams(index=None, modifier=<Modifier.NONE: 'none'>)}), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_config=None, strict_mode_config=StrictModeConfigOutput(enabled=False, max_query_limit=None, max_timeout=None, unindexed_filtering_retrieve=None, unindexed_filtering_update=None, search_max_hnsw_ef=None, search_allow_exact=None, search_max_oversampling=None, upsert_max_batchsize=None, max_collection_vector_size_bytes=None, read_rate_limit=None, write_rate_limit=None, max_collection_payload_size_bytes=None, max_points_count=None, filter_max_conditions=None, condition_max_size=None, multivector_config=None, sparse_config=None)) payload_schema={}
-----
Testing with modifier: none
success rate: 75.00% (adding more points...)
success rate: 55.00% (adding more points...)
success rate: 50.00% (adding more points...)
success rate: 35.00% (adding more points...)
success rate: 40.00% (adding more points...)
success rate: 55.00% (adding more points...)
success rate: 45.00% (adding more points...)
success rate: 45.00% (adding more points...)
success rate: 30.00% (adding more points...)
success rate: 50.00% (adding more points...)
success rate: 30.00% (adding more points...)
success rate: 15.00% (deleting points...)
success rate: 85.00% (deleting points...)
success rate: 80.00% (deleting points...)
success rate: 80.00% (deleting points...)
success rate: 85.00% (deleting points...)
success rate: 90.00% (deleting points...)
success rate: 75.00% (deleting points...)
success rate: 90.00% (deleting points...)
success rate: 90.00% (deleting points...)
-----
title='qdrant - vector search engine' version='1.14.1' commit='530430fac2a3ca872504f276d2c91a5c91f43fa0'
status=<CollectionStatus.YELLOW: 'yellow'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=None indexed_vectors_count=6080 points_count=0 segments_count=10 config=CollectionConfig(params=CollectionParams(vectors={}, shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors={'sparse': SparseVectorParams(index=None, modifier=<Modifier.NONE: 'none'>)}), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_config=None, strict_mode_config=StrictModeConfigOutput(enabled=False, max_query_limit=None, max_timeout=None, unindexed_filtering_retrieve=None, unindexed_filtering_update=None, search_max_hnsw_ef=None, search_allow_exact=None, search_max_oversampling=None, upsert_max_batchsize=None, max_collection_vector_size_bytes=None, read_rate_limit=None, write_rate_limit=None, max_collection_payload_size_bytes=None, max_points_count=None, filter_max_conditions=None, condition_max_size=None, multivector_config=None, sparse_config=None)) payload_schema={}
-----
Deleting collection: 'test'.
Creating collection: 'test'.
-----
title='qdrant - vector search engine' version='1.14.1' commit='530430fac2a3ca872504f276d2c91a5c91f43fa0'
status=<CollectionStatus.GREEN: 'green'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=None indexed_vectors_count=0 points_count=0 segments_count=8 config=CollectionConfig(params=CollectionParams(vectors={}, shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors={'sparse': SparseVectorParams(index=None, modifier=<Modifier.IDF: 'idf'>)}), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_config=None, strict_mode_config=StrictModeConfigOutput(enabled=False, max_query_limit=None, max_timeout=None, unindexed_filtering_retrieve=None, unindexed_filtering_update=None, search_max_hnsw_ef=None, search_allow_exact=None, search_max_oversampling=None, upsert_max_batchsize=None, max_collection_vector_size_bytes=None, read_rate_limit=None, write_rate_limit=None, max_collection_payload_size_bytes=None, max_points_count=None, filter_max_conditions=None, condition_max_size=None, multivector_config=None, sparse_config=None)) payload_schema={}
-----
Testing with modifier: idf
success rate: 75.00% (adding more points...)
success rate: 65.00% (adding more points...)
success rate: 65.00% (adding more points...)
success rate: 40.00% (adding more points...)
success rate: 40.00% (adding more points...)
success rate: 45.00% (adding more points...)
success rate: 40.00% (adding more points...)
success rate: 40.00% (adding more points...)
success rate: 50.00% (adding more points...)
success rate: 35.00% (adding more points...)
success rate: 45.00% (adding more points...)
success rate: 20.00% (deleting points...)
success rate: 0.00% (deleting points...)
success rate: 0.00% (deleting points...)
success rate: 0.00% (deleting points...)
success rate: 0.00% (deleting points...)
success rate: 0.00% (deleting points...)
success rate: 0.00% (deleting points...)
success rate: 0.00% (deleting points...)
success rate: 0.00% (deleting points...)
-----
title='qdrant - vector search engine' version='1.14.1' commit='530430fac2a3ca872504f276d2c91a5c91f43fa0'
status=<CollectionStatus.YELLOW: 'yellow'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=None indexed_vectors_count=4105 points_count=0 segments_count=10 config=CollectionConfig(params=CollectionParams(vectors={}, shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors={'sparse': SparseVectorParams(index=None, modifier=<Modifier.IDF: 'idf'>)}), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_config=None, strict_mode_config=StrictModeConfigOutput(enabled=False, max_query_limit=None, max_timeout=None, unindexed_filtering_retrieve=None, unindexed_filtering_update=None, search_max_hnsw_ef=None, search_allow_exact=None, search_max_oversampling=None, upsert_max_batchsize=None, max_collection_vector_size_bytes=None, read_rate_limit=None, write_rate_limit=None, max_collection_payload_size_bytes=None, max_points_count=None, filter_max_conditions=None, condition_max_size=None, multivector_config=None, sparse_config=None)) payload_schema={}
-----
Expected Behavior
The IDF accuracy is a little better than the non-IDF accuracy. When adding more points (noise), accuracy should of course trend down. But when points (noise) are deleted, accuracy should go back up — it should not drop to 0%.
Possible Solution
Context (Environment)
Any dynamic content setup.
Detailed Description
Possible Implementation
Metadata
Metadata
Assignees
Labels
bug — Something isn't working