-
Notifications
You must be signed in to change notification settings - Fork 732
Closed
Milestone
Description
Minimal example below. I suspect something weird is happening to multithreading, but was not able to confirm or find a resolution.
Python version - 3.9.1
OS - macOS Big Sur
import os
import tempfile
import pandas as pd
from txtai.tokenizer import Tokenizer
from txtai.vectors import WordVectors
from txtai.embeddings import Embeddings
# Build a small random corpus, tokenize it into a temp file, and train
# word vectors ("search-data.magnitude") from that file.
random_array = ["a" + pd.util.testing.rands_array(3, 2) for i in range(100)]
df = pd.DataFrame(random_array, columns=list("AB"))

# Write one whitespace-joined token line per row of column A; keep the
# file around (delete=False) so WordVectors.build can read it afterwards.
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as handle:
    corpus_path = handle.name
    for text in df.iloc[:, 0]:
        handle.write(" ".join(Tokenizer.tokenize(text)) + "\n")

WordVectors.build(corpus_path, 10, 1, "search-data")
os.remove(corpus_path)
def stream():
    """Yield (uid, tokens, tags) documents for indexing.

    Iterates column A of the module-level ``df``; uid is the row position,
    tokens come from Tokenizer.tokenize, and tags are always None.
    """
    for uid, text in enumerate(df.iloc[:, 0]):
        yield (uid, Tokenizer.tokenize(text), None)
# Load the word vectors trained above and index the streamed documents.
# NOTE(issue): the reported hang occurs inside embeddings.index().
embeddings = Embeddings({"path": "search-data.magnitude"})
embeddings.index(stream())
Metadata
Metadata
Assignees
Labels
No labels