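"""Gradio app for chatting with uploaded PDF documents.

Uploaded PDFs are split into chunks, embedded into a Chroma vector store, and
queried through a LangChain retrieval chain backed by a Hugging Face hosted LLM.
"""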
import os
import shutil
import subprocess

import gradio as gr
from langchain import HuggingFaceHub
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import ChatPromptTemplate
# Get the directory of the current module
module_directory = os.path.dirname(os.path.abspath(__file__))


class DocumentProcessor:
    """Loads PDFs, indexes them in a Chroma vector store, and answers questions."""

    def __init__(self, document_paths, token):
        self.document_paths = document_paths
        # The token argument is currently unused; the API token is read from the
        # 'hf_token' environment variable (e.g. a Space secret) instead.
        os.environ["HUGGINGFACEHUB_API_TOKEN"] = os.getenv('hf_token')
        self.persist_directory = './docs/chroma/'
        # Larger settings (max_new_tokens=1000) and tiiuae/falcon-40b were also tried
        self.llm = HuggingFaceHub(repo_id="tiiuae/falcon-7b-instruct",
                                  model_kwargs={"max_length": 300, "max_new_tokens": 300})
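
    # Note: langchain.HuggingFaceHub is deprecated in recent LangChain releases.
    # A rough equivalent with the maintained langchain-huggingface package (an
    # assumption, not something this app ships with) would be:
    #
    #     from langchain_huggingface import HuggingFaceEndpoint
    #     llm = HuggingFaceEndpoint(repo_id="tiiuae/falcon-7b-instruct",
    #                               max_new_tokens=300)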

    def load_documents(self, file_paths):
        self.document_paths = file_paths
        # Build one PyPDFLoader per uploaded file so every PDF is indexed
        loaders = [PyPDFLoader(path) for path in self.document_paths]
        self.docs = []
        for loader in loaders:
            self.docs.extend(loader.load())
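
    # PyPDFLoader returns one Document per page, so self.docs holds page-level
    # documents carrying source/page metadata.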

    def split_documents(self):
        # chunk_size=1500 / chunk_overlap=150 was also tried
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=15)
        self.splits = text_splitter.split_documents(self.docs)
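
    # Smaller chunks retrieve more precisely but carry less context per hit;
    # chunk_size and chunk_overlap are the knobs for trading these off.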

    def change_permissions(self, directory):
        try:
            # Recursively open up permissions so the directory can be removed
            command = ["chmod", "-R", "777", directory]
            subprocess.run(command, check=True)
            print(f"Permissions for {directory} changed to 777 successfully.")
        except subprocess.CalledProcessError as e:
            print(f"An error occurred while changing permissions: {e}")

    def delete_embeddings(self):
        # Drop any previously persisted Chroma collection before re-indexing
        if os.path.isdir(self.persist_directory):
            self.change_permissions(self.persist_directory)
            print('directory exists')
            shutil.rmtree(self.persist_directory, ignore_errors=True)

    def create_embeddings(self):
        embeddings = HuggingFaceEmbeddings()
        self.vectordb_doc = Chroma.from_documents(
            documents=self.splits,
            embedding=embeddings,
            persist_directory=self.persist_directory
        )
        # _collection is a private Chroma attribute, used here only as a quick
        # sanity check on how many chunks were indexed
        print(self.vectordb_doc._collection.count())

    def get_embeddings(self):
        return self.vectordb_doc

    def parse_output(self, response):
        # The hosted endpoint echoes the prompt back, so keep only the text
        # from "Question:" onward (the question followed by the answer)
        question_index = response.find("Question:")
        if question_index != -1:
            return response[question_index:].strip()
        else:
            return "I apologize, I don't know the answer."

    def document_chain(self):
        # A variant of this prompt wrapped the context in <context>...</context> tags
        prompt = ChatPromptTemplate.from_template("""
        Answer the following question based only on the provided context: {context}
        Question: {input}
        """)
        document_chain = create_stuff_documents_chain(self.llm, prompt)
        return document_chain
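
    # create_stuff_documents_chain "stuffs" all retrieved chunks into the
    # {context} slot of the prompt and makes a single LLM call.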

    def retrieval_chain(self, document_chain, document_embeddings):
        retriever = document_embeddings.as_retriever()
        retrieval_chain = create_retrieval_chain(retriever, document_chain)
        return retrieval_chain
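
    # as_retriever() defaults to similarity search, typically returning the
    # top four chunks per query unless search_kwargs overrides it.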

    def get_response(self, retrieval_chain, message):
        response = retrieval_chain.invoke({"input": message})
        return response["answer"]
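

# A minimal usage sketch outside the Gradio UI; 'paper.pdf' and the question
# are hypothetical, and 'hf_token' must be set in the environment:
#
#     processor = DocumentProcessor(document_paths='', token='')
#     processor.load_documents(['paper.pdf'])
#     processor.split_documents()
#     processor.delete_embeddings()
#     processor.create_embeddings()
#     chain = processor.retrieval_chain(processor.document_chain(),
#                                       processor.get_embeddings())
#     print(processor.parse_output(processor.get_response(chain, 'what is title')))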


def upload_file(files, processor):
    try:
        file_paths = [file.name for file in files]
        processor.load_documents(file_paths)
        processor.split_documents()
        processor.delete_embeddings()
        processor.create_embeddings()
        gr.Info("Document uploaded, enjoy the chat now!")
        # Return the paths so the gr.File component can display what was uploaded
        return file_paths
    except Exception as e:
        print(f"An error occurred: {e}")
        gr.Warning("Upload file(s) again!")


def echo(message, history, processor):
    try:
        document_chain = processor.document_chain()
        document_embeddings = processor.get_embeddings()
        retrieval_chain = processor.retrieval_chain(document_chain, document_embeddings)
        chain_result = processor.get_response(retrieval_chain, message)
        parsed_result = processor.parse_output(chain_result)
        return parsed_result
    except Exception as e:
        print(f"An error occurred: {e}")
        gr.Warning("An error occurred, please refresh the page!")
        # Return a string so the chat UI never receives None
        return "An error occurred, please upload your file(s) and try again."


def upload_warning():
    gr.Warning("Upload PDF file(s) first!")


def main():
    css = """
    .container {
        height: 90vh;
    }
    .container_1 {
        height: 80vh;
    }
    .container_2 {
        height: 20vh;
    }
    """
    processor = DocumentProcessor(document_paths='', token='')

    with gr.Blocks(css=css) as demo:
        # Remind the user to upload documents before chatting
        demo.load(upload_warning, inputs=None, outputs=None)
        with gr.Column(elem_classes=["container"]):
            gr.Markdown("## Chat with your Data")
            with gr.Column(elem_classes=["container_2"]):
                file_output = gr.File()
                upload_button = gr.UploadButton("Click to Upload File(s)",
                                                file_types=["pdf", "doc"],
                                                file_count="multiple")

                # Handle the upload with access to the shared processor
                def process_upload(files):
                    return upload_file(files, processor)

                upload_button.upload(process_upload, upload_button, file_output)
            with gr.Column(elem_classes=["container_1"]):
                def process_echo(message, history):
                    return echo(message, history, processor)

                gr.ChatInterface(fn=process_echo,
                                 examples=["what is title", "what is summary", "create notes"])
                gr.Markdown("* Note: Answers may be incorrect; however, they can be improved.")
    demo.launch()


if __name__ == "__main__":
    main()
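
# A plausible dependency set for this script (package names only; exact
# versions are an assumption): gradio, langchain, langchain-community,
# chromadb, pypdf, sentence-transformers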