Spaces:

Inferno-721
/

Sutra_AI

Sleeping

Sutra_AI / utils /pdf_utils.py

Initial

0753d2e 11 months ago

1.14 kB

	import PyPDF2
	from typing import List, Dict

	class PDFProcessor:
	    """Extract per-page text from a PDF and split text into word-aligned chunks."""

	    def __init__(self):
	        # Maps 0-based page index -> text of the most recently extracted PDF.
	        self.pages = {}

	    def extract_text(self, pdf_file) -> Dict[int, str]:
	        """Extract text from a PDF and return it keyed by page index.

	        Args:
	            pdf_file: Whatever ``PyPDF2.PdfReader`` accepts as its source
	                (it is passed through unchanged).

	        Returns:
	            A dictionary mapping the 0-based page index to that page's text.
	            The same dictionary is stored on ``self.pages``.
	        """
	        pdf_reader = PyPDF2.PdfReader(pdf_file)
	        # Build a fresh mapping so that pages from a previously processed,
	        # longer PDF cannot leak into this result.
	        pages: Dict[int, str] = {}
	        for page_num, page in enumerate(pdf_reader.pages):
	            # Fall back to "" so values always honour the declared str type,
	            # even if the extractor yields nothing for a page.
	            pages[page_num] = page.extract_text() or ""
	        self.pages = pages
	        return self.pages

	    def chunk_text(self, text: str, chunk_size: int = 1000) -> List[str]:
	        """Split text into space-joined chunks of at most ``chunk_size`` characters.

	        The text is split on whitespace and words are never broken. A single
	        word longer than ``chunk_size`` therefore becomes a chunk of its own
	        (and is the only case in which a chunk exceeds the limit).

	        Args:
	            text: The text to split.
	            chunk_size: Maximum length of each chunk, measured on the joined
	                string (words plus single separating spaces).

	        Returns:
	            The list of chunks in original order; empty if ``text`` contains
	            no words. No chunk is ever an empty string.
	        """
	        chunks: List[str] = []
	        current_chunk: List[str] = []
	        current_size = 0  # always equals len(' '.join(current_chunk))

	        for word in text.split():
	            # Joining adds one separating space, except before the first word.
	            separator = 1 if current_chunk else 0
	            projected_size = current_size + separator + len(word)

	            # Only flush when there is something to flush; an oversized first
	            # word must not produce an empty chunk.
	            if current_chunk and projected_size > chunk_size:
	                chunks.append(' '.join(current_chunk))
	                current_chunk = [word]
	                current_size = len(word)
	            else:
	                current_chunk.append(word)
	                current_size = projected_size

	        if current_chunk:
	            chunks.append(' '.join(current_chunk))

	        return chunks