pberck committed on
Commit
3dc3f9b
·
1 Parent(s): 6e46575

HayStack docstore and hybrid context in Pufendorf bot.

Browse files
app.py CHANGED
@@ -91,6 +91,10 @@ except Exception as e: # chromadb.errors.InvalidCollectionException:
91
  print("ERROR, no db")
92
  collection = None
93
 
 
 
 
 
94
 
95
  # Contact OpenAI "moderator".
96
  def moderator(message):
@@ -129,6 +133,12 @@ def get_context(message):
129
  return data
130
 
131
 
 
 
 
 
 
 
132
  def extract_persons(a_text) -> str:
133
  print(a_text)
134
  system_prompt = (
@@ -324,6 +334,16 @@ with gr.Blocks(theme=theme) as demo_blocks:
324
  return
325
 
326
  context = get_context(user_message)
 
 
 
 
 
 
 
 
 
 
327
  DBG("FULL CONTEXT")
328
  for x in context:
329
  DBG(x)
@@ -340,8 +360,21 @@ with gr.Blocks(theme=theme) as demo_blocks:
340
  context_str = "Context:\n"
341
  for i, x in enumerate(context): # note different after reranking
342
  DBG(x)
343
- context_str += "### " + str(i) + "\n" + x + "\n"
 
 
 
 
 
 
 
 
 
 
 
 
344
  ctx_text = context_str
 
345
  prompt = f"Context: {context_str}\nQuestion:{user_message}\n"
346
  else:
347
  ctx_text = "(no retrieved context used)"
@@ -375,6 +408,8 @@ with gr.Blocks(theme=theme) as demo_blocks:
375
  messages += history[:-1] # because the prompt has the context.
376
  ## Truncate the messages when too many?
377
  messages.append({"role": "user", "content": prompt}) ## should be ChatMessage
 
 
378
  # format_history(messages)
379
  # print("=" * 40)
380
  # print(messages)
@@ -420,6 +455,4 @@ with gr.Blocks(theme=theme) as demo_blocks:
420
  # demo.launch(share=True)
421
  if __name__ == "__main__":
422
  print("Starting")
423
- doc_store = InMemoryDocumentStore().load_from_disk("pufendorfdocs.store")
424
- print(f"Number of documents: {doc_store.count_documents()}.")
425
  demo_blocks.launch()
 
91
  print("ERROR, no db")
92
  collection = None
93
 
94
+ doc_store = InMemoryDocumentStore().load_from_disk("pufendorfdocs.store")
95
+ print(f"Number of documents: {doc_store.count_documents()}.")
96
+ hybrid_retrieval = create_hybrid_retriever(doc_store)
97
+
98
 
99
  # Contact OpenAI "moderator".
100
  def moderator(message):
 
133
  return data
134
 
135
 
136
+ # Hybrid retriever from hybrid, uses pufendorfstore.
137
+ def get_hybrid_context(message):
138
+ documents = retrieve(hybrid_retrieval, message, top_k=3, scale=True)
139
+ return documents
140
+
141
+
142
  def extract_persons(a_text) -> str:
143
  print(a_text)
144
  system_prompt = (
 
334
  return
335
 
336
  context = get_context(user_message)
337
+ hybrid_context = get_hybrid_context(user_message)
338
+ for hc in hybrid_context:
339
+ DBG(
340
+ str(hc.meta["file_path"])
341
+ + " "
342
+ + str(hc.meta["page_number"])
343
+ + "/"
344
+ + str(hc.content)
345
+ )
346
+
347
  DBG("FULL CONTEXT")
348
  for x in context:
349
  DBG(x)
 
360
  context_str = "Context:\n"
361
  for i, x in enumerate(context): # note different after reranking
362
  DBG(x)
363
+ context_str += x + "\n\n"
364
+ # The hc is the new haystack contents.
365
+ hybridkeep = os.getenv("HYBRIDKEEP")
366
+ if not hybridkeep:
367
+ hybridkeep = 3
368
+ else:
369
+ hybridkeep = int(hybridkeep)
370
+ DBG("hybrid context keep: " + str(hybridkeep))
371
+ if hybridkeep > 0:
372
+ hybrid_context = hybrid_context[0:hybridkeep]
373
+ for i, x in enumerate(hybrid_context):
374
+ DBG(x)
375
+ context_str += x.content + "\n\n"
376
  ctx_text = context_str
377
+ if ctxkeep > 0 or hybridkeep > 0:
378
  prompt = f"Context: {context_str}\nQuestion:{user_message}\n"
379
  else:
380
  ctx_text = "(no retrieved context used)"
 
408
  messages += history[:-1] # because the prompt has the context.
409
  ## Truncate the messages when too many?
410
  messages.append({"role": "user", "content": prompt}) ## should be ChatMessage
411
+ # ctx_text = str(messages)
412
+ # DBG(prompt)
413
  # format_history(messages)
414
  # print("=" * 40)
415
  # print(messages)
 
455
  # demo.launch(share=True)
456
  if __name__ == "__main__":
457
  print("Starting")
 
 
458
  demo_blocks.launch()
hybrid.py CHANGED
@@ -31,8 +31,7 @@ python hybrid.py -c newstore.store
31
  python hybrid.py -r newstore.store -q "who is pufendorf"
32
  """
33
 
34
- # embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
35
- embedding_model = "sentence-transformers/all-MiniLM-L12-v2"
36
 
37
  # see https://huggingface.co/BAAI/bge-m3
38
  reranker_model = "BAAI/bge-reranker-base"
 
31
  python hybrid.py -r newstore.store -q "who is pufendorf"
32
  """
33
 
34
+ embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
 
35
 
36
  # see https://huggingface.co/BAAI/bge-m3
37
  reranker_model = "BAAI/bge-reranker-base"
vector3_db/a1b2bf9f-4f30-46a6-a6c2-b6ca99effce9/data_level0.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b8146ecc3e4c3a36ea9b3edc3778630c452f483990ec942d38e8006f4661e430
3
  size 16760000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2f33a640a6a1c9930a46cf5a6b6ebc0e07c52d85a754892208b3725ec6d7964
3
  size 16760000
vector3_db/a1b2bf9f-4f30-46a6-a6c2-b6ca99effce9/length.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cc238a7a80a8cb9db9df824df6a3252ba0dd6f473223db345f2c4727a127151f
3
  size 40000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:174a07871bf6956d282f718bf1af45ecb44ff58d1120450e65572884e2655044
3
  size 40000
vector3_db/chroma.sqlite3 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:861003a15d8d7e1a50386e60541f0f36e1a5a431a3c76c10abbe6ac42cf8c560
3
  size 11452416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a4d40287f44cd70d89a2167703709b18734676b57e399607ebef5145003eda0
3
  size 11452416