"""Contains a function to split text into smaller chunks.""" from typing import List from langchain_core.documents import Document from langchain_text_splitters import RecursiveCharacterTextSplitter import sys from pprint import pprint # pprint(sys.path) from llm_system.config import DOC_CHAR_LIMIT, DOC_OVERLAP_NO from logger import get_logger log = get_logger(name="utils_splitter") def split_text( documents: List[Document], chunk_size: int = DOC_CHAR_LIMIT, chunk_overlap: int = DOC_OVERLAP_NO ) -> tuple[bool, List[Document], str]: """Splits a list of Document objects into smaller chunks. Args: documents (List[Document]): List of Document objects to be split. chunk_size (int): The maximum size of each chunk. chunk_overlap (int): The number of characters that overlap between chunks. Returns: tuple[bool, List[Document], str]: A tuple containing: - bool: True if the documents were split successfully, False otherwise. - List[Document]: A list of Document objects containing the split text. - str: Message indicating the result of the splitting operation. """ try: log.info(f"🔨 split_text() starting - input: {len(documents)} documents, chunk_size={chunk_size}, overlap={chunk_overlap}") text_splitter = RecursiveCharacterTextSplitter( chunk_size=chunk_size, chunk_overlap=chunk_overlap ) log.info(f"⏳ Executing text_splitter.split_documents()...") split_docs = text_splitter.split_documents(documents) log.info(f"✅ split_documents() completed, got {len(split_docs)} chunks from {len(documents)} documents") if not split_docs: log.warning("⚠️ No documents were split. Please check the input documents.") return True, [], "No documents were split. Please check the input documents." log.info(f"✅ Successfully split {len(documents)} documents into {len(split_docs)} chunks. Total content: {sum(len(d.page_content) for d in split_docs)} chars") return True, split_docs, "Documents split successfully." except Exception as e: log.error(f"❌ Error splitting documents: {e}") import traceback log.error(f"Traceback: {traceback.format_exc()}") return False, [], f"Error splitting documents: {e}" if __name__ == "__main__": # Example usage example_docs = [ Document(page_content="This is a sample document. " * 10), Document(page_content="Another document with some text. " * 5), Document(page_content="Yet another document with different content. " * 3) ] status, split_documents, message = split_text(example_docs, chunk_size=100, chunk_overlap=10) for i, doc in enumerate(split_documents): print(f"Chunk {i+1}: {doc.page_content}") # Print first 50 characters of each chunk # print(f"Chunk {i+1}: {doc.page_content[:50]}...")