simple-rag-qa / pdf_cache.py
Matvii Hotovych
Added pdf data retrieving
6a7af6b
raw
history blame contribute delete
829 Bytes
import os
import json
# Directory that holds the cached per-document text files and the metadata file.
CACHE_DIR = "data/processed"
# Path of the JSON metadata file, stored inside CACHE_DIR.
META_FILE = os.path.join(CACHE_DIR, "meta.json")
def ensure_cache_dir():
    """Create ``CACHE_DIR`` (and any missing parents) if it is not there yet.

    Calling this when the directory already exists is a no-op.
    """
    os.makedirs(CACHE_DIR, exist_ok=True)
def load_metadata():
    """Load the cache metadata stored in ``META_FILE``.

    Returns:
        The object decoded from the JSON file, or an empty dict when the
        file does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but does not contain
            valid JSON.
    """
    # Open directly rather than testing os.path.exists() first: the file can
    # vanish between the check and the open, which would raise instead of
    # being reported as "no metadata yet".
    try:
        with open(META_FILE, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return {}
def save_metadata(meta):
    """Write *meta* to ``META_FILE`` as indented JSON, replacing old content.

    The directory containing ``META_FILE`` is created first when missing, so
    the call works even before the cache directory has been set up.

    Args:
        meta: A JSON-serializable object to persist.

    Raises:
        TypeError: If *meta* contains values ``json.dump`` cannot serialize.
    """
    parent_dir = os.path.dirname(META_FILE)
    # dirname is "" for a bare file name; os.makedirs("") would raise.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(META_FILE, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)
def save_text(doc_id, text):
    """Store *text* in the cache file for *doc_id* and return that file's path.

    The text is written UTF-8 encoded to ``doc_<doc_id>.txt`` inside
    ``CACHE_DIR``, overwriting any existing file. ``CACHE_DIR`` is created
    first when missing, so the call works before the cache has been set up.

    Args:
        doc_id: Identifier interpolated into the file name.
        text: The string to write.

    Returns:
        The path of the file that was written.
    """
    os.makedirs(CACHE_DIR, exist_ok=True)
    path = os.path.join(CACHE_DIR, f"doc_{doc_id}.txt")
    with open(path, "w", encoding="utf-8") as f:
        f.write(text)
    return path
def load_text(doc_id):
    """Return the cached text for *doc_id*, or ``None`` if nothing is cached.

    Reads ``doc_<doc_id>.txt`` from ``CACHE_DIR`` as UTF-8.

    Args:
        doc_id: Identifier interpolated into the file name.

    Returns:
        The file's full contents as a string, or ``None`` when the file
        does not exist.
    """
    path = os.path.join(CACHE_DIR, f"doc_{doc_id}.txt")
    # Open directly rather than testing os.path.exists() first: the file can
    # be removed between the check and the open, which would raise instead
    # of being reported as a cache miss.
    try:
        with open(path, "r", encoding="utf-8") as f:
            return f.read()
    except FileNotFoundError:
        return None