Spaces:
Runtime error
Runtime error
ej68okap
committed on
Commit
·
a53d884
1
Parent(s):
9832882
new code added
Browse files - milvus_manager.py +68 -6
milvus_manager.py
CHANGED
|
@@ -99,17 +99,17 @@ class MilvusManager:
|
|
| 99 |
self.client.create_index(
|
| 100 |
collection_name=self.collection_name, index_params=index_params, sync=True
|
| 101 |
)
|
| 102 |
-
|
| 103 |
-
def search(self, data, topk):
|
| 104 |
"""
|
| 105 |
-
Search for the top-k most similar vectors in the collection.
|
| 106 |
|
| 107 |
Args:
|
| 108 |
data (array-like): Query vector.
|
| 109 |
topk (int): Number of top results to return.
|
|
|
|
| 110 |
|
| 111 |
Returns:
|
| 112 |
-
list: Sorted list of top-k results.
|
| 113 |
"""
|
| 114 |
search_params = {"metric_type": "IP", "params": {}} # Search parameters for Inner Product
|
| 115 |
results = self.client.search(
|
|
@@ -155,9 +155,71 @@ class MilvusManager:
|
|
| 155 |
score, doc_id = future.result()
|
| 156 |
scores.append((score, doc_id))
|
| 157 |
|
|
|
|
|
|
|
|
|
|
| 158 |
# Sort scores in descending order and return the top-k results
|
| 159 |
-
|
| 160 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
|
| 162 |
def insert(self, data):
|
| 163 |
"""
|
|
|
|
| 99 |
self.client.create_index(
|
| 100 |
collection_name=self.collection_name, index_params=index_params, sync=True
|
| 101 |
)
|
| 102 |
+
def search(self, data, topk, threshold=0.7):
|
|
|
|
| 103 |
"""
|
| 104 |
+
Search for the top-k most similar vectors in the collection, filtered by a relevance threshold.
|
| 105 |
|
| 106 |
Args:
|
| 107 |
data (array-like): Query vector.
|
| 108 |
topk (int): Number of top results to return.
|
| 109 |
+
threshold (float): Minimum score threshold for relevance (default is 0.7).
|
| 110 |
|
| 111 |
Returns:
|
| 112 |
+
list: Sorted list of top-k results that meet the threshold.
|
| 113 |
"""
|
| 114 |
search_params = {"metric_type": "IP", "params": {}} # Search parameters for Inner Product
|
| 115 |
results = self.client.search(
|
|
|
|
| 155 |
score, doc_id = future.result()
|
| 156 |
scores.append((score, doc_id))
|
| 157 |
|
| 158 |
+
# Filter scores by threshold
|
| 159 |
+
filtered_scores = [item for item in scores if item[0] >= threshold]
|
| 160 |
+
|
| 161 |
# Sort scores in descending order and return the top-k results
|
| 162 |
+
filtered_scores.sort(key=lambda x: x[0], reverse=True)
|
| 163 |
+
return filtered_scores[:topk] if len(filtered_scores) >= topk else filtered_scores
|
| 164 |
+
|
| 165 |
+
# def search(self, data, topk):
|
| 166 |
+
# """
|
| 167 |
+
# Search for the top-k most similar vectors in the collection.
|
| 168 |
+
|
| 169 |
+
# Args:
|
| 170 |
+
# data (array-like): Query vector.
|
| 171 |
+
# topk (int): Number of top results to return.
|
| 172 |
+
|
| 173 |
+
# Returns:
|
| 174 |
+
# list: Sorted list of top-k results.
|
| 175 |
+
# """
|
| 176 |
+
# search_params = {"metric_type": "IP", "params": {}} # Search parameters for Inner Product
|
| 177 |
+
# results = self.client.search(
|
| 178 |
+
# self.collection_name,
|
| 179 |
+
# data,
|
| 180 |
+
# limit=50, # Initial retrieval limit
|
| 181 |
+
# output_fields=["vector", "seq_id", "doc_id"], # Fields to include in the output
|
| 182 |
+
# search_params=search_params,
|
| 183 |
+
# )
|
| 184 |
+
|
| 185 |
+
# # Collect unique document IDs from the search results
|
| 186 |
+
# doc_ids = set()
|
| 187 |
+
# for r_id in range(len(results)):
|
| 188 |
+
# for r in range(len(results[r_id])):
|
| 189 |
+
# doc_ids.add(results[r_id][r]["entity"]["doc_id"])
|
| 190 |
+
|
| 191 |
+
# scores = []
|
| 192 |
+
|
| 193 |
+
# # Function to rerank a single document based on its relevance to the query
|
| 194 |
+
# def rerank_single_doc(doc_id, data, client, collection_name):
|
| 195 |
+
# doc_colbert_vecs = client.query(
|
| 196 |
+
# collection_name=collection_name,
|
| 197 |
+
# filter=f"doc_id in [{doc_id}, {doc_id + 1}]", # Query documents by ID
|
| 198 |
+
# output_fields=["seq_id", "vector", "doc"], # Fields to retrieve
|
| 199 |
+
# limit=1000, # Retrieve a maximum of 1000 vectors per document
|
| 200 |
+
# )
|
| 201 |
+
# # Compute the maximum similarity score for the document
|
| 202 |
+
# doc_vecs = np.vstack(
|
| 203 |
+
# [doc_colbert_vecs[i]["vector"] for i in range(len(doc_colbert_vecs))]
|
| 204 |
+
# )
|
| 205 |
+
# score = np.dot(data, doc_vecs.T).max(1).sum()
|
| 206 |
+
# return (score, doc_id)
|
| 207 |
+
|
| 208 |
+
# # Use multithreading to rerank documents in parallel
|
| 209 |
+
# with concurrent.futures.ThreadPoolExecutor(max_workers=300) as executor:
|
| 210 |
+
# futures = {
|
| 211 |
+
# executor.submit(
|
| 212 |
+
# rerank_single_doc, doc_id, data, self.client, self.collection_name
|
| 213 |
+
# ): doc_id
|
| 214 |
+
# for doc_id in doc_ids
|
| 215 |
+
# }
|
| 216 |
+
# for future in concurrent.futures.as_completed(futures):
|
| 217 |
+
# score, doc_id = future.result()
|
| 218 |
+
# scores.append((score, doc_id))
|
| 219 |
+
|
| 220 |
+
# # Sort scores in descending order and return the top-k results
|
| 221 |
+
# scores.sort(key=lambda x: x[0], reverse=True)
|
| 222 |
+
# return scores[:topk] if len(scores) >= topk else scores
|
| 223 |
|
| 224 |
def insert(self, data):
|
| 225 |
"""
|