import os

# Must be set before transformers loads any tokenizer, otherwise forked worker
# processes emit tokenizer-parallelism warnings.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from typing import List

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

from langchain.prompts import ChatPromptTemplate
from langchain.retrievers import EnsembleRetriever
from langchain.schema import StrOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import OnlinePDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import FAISS
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

def create_deepseek_pipeline() -> HuggingFacePipeline:
    """
    Create a HuggingFace text-generation pipeline using the DeepSeek-R1 model
    and wrap it as a LangChain LLM.
    """
    model = AutoModelForCausalLM.from_pretrained(
        "deepseek-ai/DeepSeek-R1",
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        trust_remote_code=True,
        # Cap generated tokens only; max_length would also count the
        # (potentially long) RAG prompt toward the limit.
        max_new_tokens=2048,
        do_sample=True,
        temperature=0.5,
        top_p=1,
    )

    return HuggingFacePipeline(pipeline=pipe)
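
# DeepSeek-R1 is a very large checkpoint; loading it locally needs substantial
# GPU memory. For a lighter sanity check, a distilled variant such as
# "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" (an assumed substitute) can be
# dropped into create_deepseek_pipeline() with no other changes, e.g.:
#
#   llm = create_deepseek_pipeline()
#   print(llm.invoke("Briefly explain retrieval-augmented generation."))
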
class ElevatedRagChain:
    """
    ElevatedRagChain integrates LangChain components into an advanced
    retrieval-augmented generation (RAG) system. It processes PDF documents by
    loading, chunking, and embedding them into a FAISS vector store for
    efficient retrieval, then combines an ensemble retriever (BM25 + FAISS)
    with a DeepSeek model (via a HuggingFace pipeline) to generate detailed
    technical answers.
    """

    def __init__(self) -> None:
        """
        Initialize the chain with a predefined embedding function, retriever
        weights, and top_k value.
        """
        # Small, fast sentence-embedding model used for dense (FAISS) retrieval.
        self.embed_func = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        self.bm25_weight = 0.6
        self.faiss_weight = 0.4
        self.top_k = 5
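        # These weights are consumed by the EnsembleRetriever below, which fuses
        # the two ranked lists via weighted Reciprocal Rank Fusion: 0.6 favors
        # exact keyword hits (BM25), 0.4 semantic hits (FAISS); top_k bounds the
        # number of documents each retriever contributes.
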
    def add_pdfs_to_vector_store(
        self,
        pdf_links: List[str],
        chunk_size: int = 1500,
    ) -> None:
        """
        Process PDF documents by loading, chunking, and embedding them, then add
        them to a FAISS vector store.

        Args:
            pdf_links (List[str]): URLs of the PDF documents to process.
            chunk_size (int, optional): Size of the text chunks the documents
                are split into (default: 1500).
        """
        # Download and load each PDF from its URL.
        self.raw_data = [OnlinePDFLoader(doc).load()[0] for doc in pdf_links]

        # Split the documents into overlapping chunks (adjacent chunks share
        # 100 characters) so passages are not cut off mid-context.
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=100)
        self.split_data = self.text_splitter.split_documents(self.raw_data)

        # Sparse retriever: keyword-based BM25 ranking over the chunks.
        self.bm25_retriever = BM25Retriever.from_documents(self.split_data)
        self.bm25_retriever.k = self.top_k

        # Dense retriever: embed the chunks and index them in FAISS.
        self.vector_store = FAISS.from_documents(self.split_data, self.embed_func)
        self.faiss_retriever = self.vector_store.as_retriever(search_kwargs={"k": self.top_k})
        print("All PDFs processed and added to vector store.")

        self.build_elevated_rag_system()
        print("RAG system built successfully.")
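    # Sketch (assumes the store has been populated): either retriever can be
    # probed on its own to compare sparse vs. dense hits, e.g.
    #   rag.bm25_retriever.invoke("transformer attention")   # keyword match
    #   rag.faiss_retriever.invoke("transformer attention")  # semantic match
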
    def build_elevated_rag_system(self) -> None:
        """
        Build the advanced RAG system by combining:
        - a BM25 retriever
        - a FAISS vector store retriever
        - a DeepSeek model (via a HuggingFace pipeline)

        Note: retrieval uses an ensemble of the BM25 and FAISS retrievers
        without any additional reranking.
        """
        self.ensemble_retriever = EnsembleRetriever(
            retrievers=[self.bm25_retriever, self.faiss_retriever],
            weights=[self.bm25_weight, self.faiss_weight],
        )

        RAG_PROMPT_TEMPLATE = """\
Use the following context to provide a detailed technical answer to the user's question.
Do not include an introduction like "Based on the provided documents, ...". Just answer the question.
If you don't know the answer, please respond with "I don't know".

Context:
{context}

User's question:
{question}
"""
        self.rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT_TEMPLATE)
        self.str_output_parser = StrOutputParser()

        # Fan the incoming question out in parallel: the ensemble retriever
        # fills {context}, while the question passes through unchanged to
        # fill {question}.
        self.entry_point_and_elevated_retriever = RunnableParallel(
            {
                "context": self.ensemble_retriever,
                "question": RunnablePassthrough(),
            }
        )

        self.llm = create_deepseek_pipeline()

        # Full chain: retrieve -> prompt -> generate -> parse to a plain string.
        self.elevated_rag_chain = (
            self.entry_point_and_elevated_retriever
            | self.rag_prompt
            | self.llm
            | self.str_output_parser
        )
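
if __name__ == "__main__":
    # Minimal end-to-end sketch. The arXiv URL is only an example input; any
    # publicly reachable PDF link works. Running this requires enough hardware
    # to load the DeepSeek checkpoint used in create_deepseek_pipeline().
    rag = ElevatedRagChain()
    rag.add_pdfs_to_vector_store(["https://arxiv.org/pdf/1706.03762.pdf"])
    answer = rag.elevated_rag_chain.invoke("What problem does the paper address?")
    print(answer)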