"""Document-processing pipeline: ingest, convert, chunk and index.

Each ``*_step`` function runs one stage of the pipeline. All working
directories are resolved relative to ``BASE`` (the parent of the directory
containing this file) so the script behaves the same regardless of the
current working directory.
"""

import datetime
import json
import logging
import pathlib
import re
import shutil

from ingest_manifest import ingest_manifest
from chunks_and_metadata import convert_md_to_chunks
from document_parser import convert_doc_to_md
from bm25_index import load_chunks, build_bm25_index, save_index, load_index
from embedding_index import main as build_embedding_index_main

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

BASE = pathlib.Path(__file__).resolve().parent.parent
RAW = BASE / "raw_docs"
CONVERTED = BASE / "converted"
CHUNKS_DIR = BASE / "chunks"
# temp fix: make sure the chunk directory exists as soon as the module loads
CHUNKS_DIR.mkdir(parents=True, exist_ok=True)
INDEX_OUT = BASE / "bm25_index.pkl"
CHROMA_DIR = BASE / "chroma_db"
MAX_TOKENS = 512
OVERLAP_TOKENS = 50

# File suffixes (lower-case) that the markdown conversion step accepts.
SUPPORTED_DOC_SUFFIXES = frozenset({".docx"})


# step 1: ingest docs to raw_docs
def ingest_manifest_step() -> None:
    """Run the manifest ingestion stage by delegating to ``ingest_manifest``."""
    ingest_manifest()


# step 2: convert docs to markdown
def convert_docs_to_markdown_step() -> None:
    """Convert every supported document in ``RAW`` to markdown in ``CONVERTED``.

    Files whose suffix is not in ``SUPPORTED_DOC_SUFFIXES`` are logged and
    skipped. Each output file is named ``<stem>.md``.
    """
    # The converter writes into CONVERTED; make sure it exists first.
    CONVERTED.mkdir(parents=True, exist_ok=True)
    for doc in RAW.iterdir():
        if doc.suffix.lower() not in SUPPORTED_DOC_SUFFIXES:
            logger.info("Skipping: %s", doc)
            continue
        out = CONVERTED / (doc.stem + ".md")
        convert_doc_to_md(doc, out)
        logger.info("Converted: %s", out)


# step 3: process md to chunks
def convert_md_to_chunks_step() -> None:
    """Chunk every markdown file in ``CONVERTED`` and write a chunk manifest.

    The per-file results of ``convert_md_to_chunks`` are concatenated and
    written, together with a UTC generation timestamp, to
    ``CHUNKS_DIR / "chunks_manifest.json"``.
    """
    manifests = []
    # Sorted for a reproducible manifest order; only .md files are chunked so
    # stray files (e.g. OS metadata) in CONVERTED do not reach the chunker.
    for md in sorted(CONVERTED.iterdir()):
        if not md.is_file() or md.suffix.lower() != ".md":
            continue
        manifests.extend(convert_md_to_chunks(md, CHUNKS_DIR))

    # Aware "now" with tzinfo stripped renders the same "...Z" string as the
    # deprecated utcnow().isoformat() + "Z".
    now_utc = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    payload = {
        "generated_at": now_utc.isoformat() + "Z",
        "chunks": manifests,
    }
    with open(CHUNKS_DIR / "chunks_manifest.json", "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)
    logger.info("Wrote %d chunks", len(manifests))


# step 4, 5: build bm25 index and embedding index
def build_bm25_index_step() -> None:
    """Rebuild the BM25 index from ``CHUNKS_DIR`` and save it to ``INDEX_OUT``.

    Any existing index file is deleted first.
    """
    # delete existing bm25 index
    INDEX_OUT.unlink(missing_ok=True)
    chunks = load_chunks(CHUNKS_DIR)
    bm25_index = build_bm25_index(chunks)
    save_index(bm25_index, INDEX_OUT)
    logger.info("Built bm25 index and saved to %s", INDEX_OUT)


def build_embedding_index_step() -> None:
    """Rebuild the embedding index under ``CHROMA_DIR`` from scratch.

    The existing persist directory is removed, then the embedding builder is
    invoked with ``force_reembed=True``.
    """
    # delete existing embedding index
    if CHROMA_DIR.exists():
        shutil.rmtree(CHROMA_DIR)
    build_embedding_index_main(
        # Absolute path so the step does not depend on the working directory.
        chunks_dir=str(CHUNKS_DIR),
        persist_dir=str(CHROMA_DIR),
        collection="snote",
        model_name="AITeamVN/Vietnamese_Embedding_v2",
        batch_size=100,
        device="cpu",
        force_reembed=True,
    )


if __name__ == "__main__":
    # Earlier stages are currently disabled; uncomment to run the full pipeline.
    # ingest_manifest_step()
    # convert_docs_to_markdown_step()
    # convert_md_to_chunks_step()
    # build_bm25_index_step()
    build_embedding_index_step()