Add Haystack document store and hybrid context to the Pufendorf bot.
Browse files
app.py
CHANGED
|
@@ -91,6 +91,10 @@ except Exception as e: # chromadb.errors.InvalidCollectionException:
|
|
| 91 |
print("ERROR, no db")
|
| 92 |
collection = None
|
| 93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
# Contact OpenAI "moderator".
|
| 96 |
def moderator(message):
|
|
@@ -129,6 +133,12 @@ def get_context(message):
|
|
| 129 |
return data
|
| 130 |
|
| 131 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
def extract_persons(a_text) -> str:
|
| 133 |
print(a_text)
|
| 134 |
system_prompt = (
|
|
@@ -324,6 +334,16 @@ with gr.Blocks(theme=theme) as demo_blocks:
|
|
| 324 |
return
|
| 325 |
|
| 326 |
context = get_context(user_message)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 327 |
DBG("FULL CONTEXT")
|
| 328 |
for x in context:
|
| 329 |
DBG(x)
|
|
@@ -340,8 +360,21 @@ with gr.Blocks(theme=theme) as demo_blocks:
|
|
| 340 |
context_str = "Context:\n"
|
| 341 |
for i, x in enumerate(context): # note different after reranking
|
| 342 |
DBG(x)
|
| 343 |
-
context_str +=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 344 |
ctx_text = context_str
|
|
|
|
| 345 |
prompt = f"Context: {context_str}\nQuestion:{user_message}\n"
|
| 346 |
else:
|
| 347 |
ctx_text = "(no retrieved context used)"
|
|
@@ -375,6 +408,8 @@ with gr.Blocks(theme=theme) as demo_blocks:
|
|
| 375 |
messages += history[:-1] # because the prompt has the context.
|
| 376 |
## Truncate the messages when too many?
|
| 377 |
messages.append({"role": "user", "content": prompt}) ## should be ChatMessage
|
|
|
|
|
|
|
| 378 |
# format_history(messages)
|
| 379 |
# print("=" * 40)
|
| 380 |
# print(messages)
|
|
@@ -420,6 +455,4 @@ with gr.Blocks(theme=theme) as demo_blocks:
|
|
| 420 |
# demo.launch(share=True)
|
| 421 |
if __name__ == "__main__":
|
| 422 |
print("Starting")
|
| 423 |
-
doc_store = InMemoryDocumentStore().load_from_disk("pufendorfdocs.store")
|
| 424 |
-
print(f"Number of documents: {doc_store.count_documents()}.")
|
| 425 |
demo_blocks.launch()
|
|
|
|
| 91 |
print("ERROR, no db")
|
| 92 |
collection = None
|
| 93 |
|
| 94 |
+
doc_store = InMemoryDocumentStore().load_from_disk("pufendorfdocs.store")
|
| 95 |
+
print(f"Number of documents: {doc_store.count_documents()}.")
|
| 96 |
+
hybrid_retrieval = create_hybrid_retriever(doc_store)
|
| 97 |
+
|
| 98 |
|
| 99 |
# Contact OpenAI "moderator".
|
| 100 |
def moderator(message):
|
|
|
|
| 133 |
return data
|
| 134 |
|
| 135 |
|
| 136 |
+
# Hybrid retrieval via the hybrid module; uses the document store loaded from pufendorfdocs.store.
|
| 137 |
+
def get_hybrid_context(message):
|
| 138 |
+
documents = retrieve(hybrid_retrieval, message, top_k=3, scale=True)
|
| 139 |
+
return documents
|
| 140 |
+
|
| 141 |
+
|
| 142 |
def extract_persons(a_text) -> str:
|
| 143 |
print(a_text)
|
| 144 |
system_prompt = (
|
|
|
|
| 334 |
return
|
| 335 |
|
| 336 |
context = get_context(user_message)
|
| 337 |
+
hybrid_context = get_hybrid_context(user_message)
|
| 338 |
+
for hc in hybrid_context:
|
| 339 |
+
DBG(
|
| 340 |
+
str(hc.meta["file_path"])
|
| 341 |
+
+ " "
|
| 342 |
+
+ str(hc.meta["page_number"])
|
| 343 |
+
+ "/"
|
| 344 |
+
+ str(hc.content)
|
| 345 |
+
)
|
| 346 |
+
|
| 347 |
DBG("FULL CONTEXT")
|
| 348 |
for x in context:
|
| 349 |
DBG(x)
|
|
|
|
| 360 |
context_str = "Context:\n"
|
| 361 |
for i, x in enumerate(context): # note different after reranking
|
| 362 |
DBG(x)
|
| 363 |
+
context_str += x + "\n\n"
|
| 364 |
+
# Append the Haystack hybrid-retrieval results (hybrid_context) to the context string.
|
| 365 |
+
hybridkeep = os.getenv("HYBRIDKEEP")
|
| 366 |
+
if not hybridkeep:
|
| 367 |
+
hybridkeep = 3
|
| 368 |
+
else:
|
| 369 |
+
hybridkeep = int(hybridkeep)
|
| 370 |
+
DBG("hybrid context keep: " + str(hybridkeep))
|
| 371 |
+
if hybridkeep > 0:
|
| 372 |
+
hybrid_context = hybrid_context[0:hybridkeep]
|
| 373 |
+
for i, x in enumerate(hybrid_context):
|
| 374 |
+
DBG(x)
|
| 375 |
+
context_str += x.content + "\n\n"
|
| 376 |
ctx_text = context_str
|
| 377 |
+
if ctxkeep > 0 or hybridkeep > 0:
|
| 378 |
prompt = f"Context: {context_str}\nQuestion:{user_message}\n"
|
| 379 |
else:
|
| 380 |
ctx_text = "(no retrieved context used)"
|
|
|
|
| 408 |
messages += history[:-1] # because the prompt has the context.
|
| 409 |
## Truncate the messages when too many?
|
| 410 |
messages.append({"role": "user", "content": prompt}) ## should be ChatMessage
|
| 411 |
+
# ctx_text = str(messages)
|
| 412 |
+
# DBG(prompt)
|
| 413 |
# format_history(messages)
|
| 414 |
# print("=" * 40)
|
| 415 |
# print(messages)
|
|
|
|
| 455 |
# demo.launch(share=True)
|
| 456 |
if __name__ == "__main__":
|
| 457 |
print("Starting")
|
|
|
|
|
|
|
| 458 |
demo_blocks.launch()
|
hybrid.py
CHANGED
|
@@ -31,8 +31,7 @@ python hybrid.py -c newstore.store
|
|
| 31 |
python hybrid.py -r newstore.store -q "who is pufendorf"
|
| 32 |
"""
|
| 33 |
|
| 34 |
-
|
| 35 |
-
embedding_model = "sentence-transformers/all-MiniLM-L12-v2"
|
| 36 |
|
| 37 |
# see https://huggingface.co/BAAI/bge-m3
|
| 38 |
reranker_model = "BAAI/bge-reranker-base"
|
|
|
|
| 31 |
python hybrid.py -r newstore.store -q "who is pufendorf"
|
| 32 |
"""
|
| 33 |
|
| 34 |
+
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
|
|
|
|
| 35 |
|
| 36 |
# see https://huggingface.co/BAAI/bge-m3
|
| 37 |
reranker_model = "BAAI/bge-reranker-base"
|
vector3_db/a1b2bf9f-4f30-46a6-a6c2-b6ca99effce9/data_level0.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 16760000
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a2f33a640a6a1c9930a46cf5a6b6ebc0e07c52d85a754892208b3725ec6d7964
|
| 3 |
size 16760000
|
vector3_db/a1b2bf9f-4f30-46a6-a6c2-b6ca99effce9/length.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 40000
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:174a07871bf6956d282f718bf1af45ecb44ff58d1120450e65572884e2655044
|
| 3 |
size 40000
|
vector3_db/chroma.sqlite3
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 11452416
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2a4d40287f44cd70d89a2167703709b18734676b57e399607ebef5145003eda0
|
| 3 |
size 11452416
|