Spaces:
Build error
Build error
# Builds the "bhagavadgita" Pinecone index from the PDFs under data/:
# load the documents, split them into chunks, embed every chunk, and
# upsert the resulting vectors in batches.
import os

from pinecone import Pinecone, ServerlessSpec

from src.helper import download_hugging_face_embeddings, load_pdf, text_split

# Credentials come from the environment only. A placeholder fallback would
# just postpone the failure to the first Pinecone call and hide its cause.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY environment variable is not set; "
        "export it before running this script."
    )
# Used below as the serverless region of the index.
PINECONE_API_ENV = os.getenv("PINECONE_API_ENV", "us-east-1")

# Initialize Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)

# Create the index only when it does not exist yet, so re-running the
# script reuses the existing index instead of failing on a duplicate name.
index_name = "bhagavadgita"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must equal the length of the vectors returned by
        # embeddings.embed_documents() below.
        dimension=384,
        metric='euclidean',
        spec=ServerlessSpec(
            cloud='aws',
            region=PINECONE_API_ENV,
        ),
    )

# Connect to the index
index = pc.Index(index_name)

# Load the PDFs, split them into chunks and embed each chunk's text.
extracted_data = load_pdf("data/")
text_chunks = text_split(extracted_data)
embeddings = download_hugging_face_embeddings()
vectors = embeddings.embed_documents([t.page_content for t in text_chunks])

# IDs are positional ("doc_0", "doc_1", ...), so a re-run over the same
# chunks overwrites the previous records rather than duplicating them.
ids = [f"doc_{i}" for i in range(len(text_chunks))]

# NOTE(review): only (id, vector) pairs are stored; the chunk text is not
# attached as metadata. Confirm the retrieval side does not need it.

# Upsert in slices so a single request never carries the whole corpus.
batch_size = 1000  # Adjust batch size as needed
for i in range(0, len(vectors), batch_size):
    batch_ids = ids[i:i + batch_size]
    batch_vectors = vectors[i:i + batch_size]
    index.upsert(vectors=list(zip(batch_ids, batch_vectors)))

print("Indexing completed.")