|
|
import datetime |
|
|
import json |
|
|
import shutil |
|
|
from ingest_manifest import ingest_manifest |
|
|
from chunks_and_metadata import convert_md_to_chunks |
|
|
from document_parser import convert_doc_to_md |
|
|
from bm25_index import load_chunks, build_bm25_index, save_index, load_index |
|
|
from embedding_index import main as build_embedding_index_main |
|
|
import pathlib, re |
|
|
import logging |
|
|
|
|
|
# Emit INFO-level messages from every pipeline step.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Project root: two levels up from this file (the parent of the directory
# that contains it). Defined once; every other path is anchored here.
BASE = pathlib.Path(__file__).resolve().parent.parent

# Input documents that the conversion step iterates over.
RAW = BASE / "raw_docs"
# Markdown files produced by the conversion step.
CONVERTED = BASE / "converted"
# Chunk files and the chunk manifest; created at import time.
CHUNKS_DIR = BASE / "chunks"
CHUNKS_DIR.mkdir(parents=True, exist_ok=True)
# Location of the saved BM25 index.
INDEX_OUT = BASE / "bm25_index.pkl"

# Chunking parameters. NOTE(review): not referenced by the code visible in
# this file -- presumably consumed elsewhere; confirm before removing.
MAX_TOKENS = 512
OVERLAP_TOKENS = 50
|
|
|
|
|
|
|
|
|
|
|
def ingest_manifest_step():
    """Pipeline step: run ``ingest_manifest`` with no arguments.

    The return value of ``ingest_manifest`` is discarded; this step always
    returns ``None``.
    """
    ingest_manifest()
    return None
|
|
|
|
|
|
|
|
def convert_docs_to_markdown_step():
    """Convert every ``.docx`` entry in ``RAW`` to Markdown under ``CONVERTED``.

    Entries whose suffix is not ``.docx`` (compared case-insensitively) are
    logged and skipped. Each remaining document is passed to
    ``convert_doc_to_md`` together with its output path
    ``CONVERTED / "<stem>.md"``.
    """
    # Make sure the output directory exists before anything is written to it.
    CONVERTED.mkdir(parents=True, exist_ok=True)

    for doc in RAW.iterdir():
        if doc.suffix.lower() != ".docx":
            # Logging needs a %s placeholder for each extra argument; without
            # one the record fails to format and the path is never shown.
            logger.info("Skipping: %s", doc)
            continue

        out = CONVERTED / (doc.stem + ".md")
        convert_doc_to_md(doc, out)
        logger.info("Converted: %s", out)
|
|
|
|
|
|
|
|
def convert_md_to_chunks_step():
    """Chunk every entry in ``CONVERTED`` and write a combined manifest.

    Each entry is passed to ``convert_md_to_chunks`` with ``CHUNKS_DIR`` as
    the destination; the lists it returns are concatenated. The result is
    written to ``CHUNKS_DIR / "chunks_manifest.json"`` as a JSON object with
    two keys: ``generated_at`` (UTC timestamp, ISO 8601 with a trailing
    ``Z``) and ``chunks`` (the concatenated manifest entries).
    """
    manifests = []
    for md in CONVERTED.iterdir():
        manifests.extend(convert_md_to_chunks(md, CHUNKS_DIR))

    # ``datetime.utcnow()`` is deprecated since Python 3.12. Take an aware UTC
    # time and drop the tzinfo so ``isoformat()`` emits no "+00:00" offset,
    # keeping the exact "<naive-iso>Z" format used before.
    now_utc = datetime.datetime.now(datetime.timezone.utc)
    generated_at = now_utc.replace(tzinfo=None).isoformat() + "Z"

    manifest_path = CHUNKS_DIR / "chunks_manifest.json"
    with open(manifest_path, "w", encoding="utf-8") as f:
        json.dump(
            {"generated_at": generated_at, "chunks": manifests},
            f,
            ensure_ascii=False,
            indent=2,
        )

    # Use a placeholder for the count; bare extra arguments break formatting.
    logger.info("Wrote %d chunks", len(manifests))
|
|
|
|
|
|
|
|
def build_bm25_index_step():
    """Rebuild the BM25 index from ``CHUNKS_DIR`` and save it to ``INDEX_OUT``.

    Any existing file at ``INDEX_OUT`` is deleted before the chunks are
    loaded, so the old index is gone even if a later call raises.
    """
    index_path = INDEX_OUT

    # Remove an index left over from a previous run before rebuilding.
    if index_path.exists():
        index_path.unlink()

    # Load -> build -> save, without naming the intermediate objects.
    save_index(build_bm25_index(load_chunks(CHUNKS_DIR)), index_path)
    logger.info("Built bm25 index and saved to %s", index_path)
|
|
|
|
|
def build_embedding_index_step():
    """Rebuild the embedding index persisted in ``BASE / "chroma_db"``.

    An existing ``chroma_db`` directory is removed first, then
    ``build_embedding_index_main`` is called with fixed settings: collection
    ``"snote"``, model ``"AITeamVN/Vietnamese_Embedding_v2"``, batch size 100,
    CPU device, and forced re-embedding.
    """
    # Reuse the module-level project root instead of re-deriving it with
    # os.path, and build the persist path once instead of three times.
    persist_dir = BASE / "chroma_db"

    if persist_dir.exists():
        shutil.rmtree(persist_dir)

    build_embedding_index_main(
        # Pass the absolute chunk directory the chunking step writes to. A
        # bare "chunks" would resolve against the current working directory
        # and only work when the script is launched from BASE.
        chunks_dir=str(CHUNKS_DIR),
        persist_dir=str(persist_dir),
        collection="snote",
        model_name="AITeamVN/Vietnamese_Embedding_v2",
        batch_size=100,
        device="cpu",
        force_reembed=True,
    )
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point. Only the embedding-index step runs from here; the
    # other pipeline steps defined in this file are not invoked.
    build_embedding_index_step()