Spaces:
Sleeping
Sleeping
| import PyPDF2 | |
| from typing import List, Dict | |
class PDFProcessor:
    """Extract per-page text from a PDF and split text into word-aligned chunks."""

    def __init__(self):
        # Maps zero-based page index -> text extracted from that page by the
        # most recent extract_text() call.
        self.pages = {}

    def extract_text(self, pdf_file) -> Dict[int, str]:
        """Extract text from a PDF, keyed by zero-based page index.

        Args:
            pdf_file: Anything ``PyPDF2.PdfReader`` accepts as its source.

        Returns:
            A dictionary mapping each page index to the text extracted from
            that page. The same dictionary is stored on ``self.pages``.
        """
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Build a fresh mapping so pages left over from a previously processed
        # (longer) document never leak into this result.
        self.pages = {
            page_num: page.extract_text()
            for page_num, page in enumerate(pdf_reader.pages)
        }
        return self.pages

    def chunk_text(self, text: str, chunk_size: int = 1000) -> List[str]:
        """Split text into space-joined chunks of at most ``chunk_size`` characters.

        Words are never broken apart: a single word longer than ``chunk_size``
        becomes a chunk of its own and is the only way a chunk can exceed the
        limit. Runs of whitespace in ``text`` collapse to single spaces.

        Args:
            text: The text to split.
            chunk_size: Maximum length of each chunk, in characters.

        Returns:
            The chunks in their original order; an empty list when ``text``
            contains no words. No chunk is ever an empty string.
        """
        chunks: List[str] = []
        current_chunk: List[str] = []
        current_size = 0  # Always equals len(' '.join(current_chunk)).

        for word in text.split():
            # A separator space is only needed when the chunk already has a word.
            separator = 1 if current_chunk else 0
            projected_size = current_size + separator + len(word)

            # Only flush a non-empty chunk; otherwise an oversized first word
            # would produce a spurious empty chunk.
            if current_chunk and projected_size > chunk_size:
                chunks.append(' '.join(current_chunk))
                current_chunk = [word]
                current_size = len(word)
            else:
                current_chunk.append(word)
                current_size = projected_size

        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks