File size: 2,770 Bytes
44c5827
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import datetime
import json
import shutil
from ingest_manifest import ingest_manifest
from chunks_and_metadata import convert_md_to_chunks
from document_parser import convert_doc_to_md
from bm25_index import load_chunks, build_bm25_index, save_index, load_index
from embedding_index import main as build_embedding_index_main
import pathlib, re
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Project layout: this file lives one level below the project root,
# so BASE resolves to <project-root>.  (Was assigned twice in the
# original; deduplicated here.)
BASE = pathlib.Path(__file__).resolve().parent.parent
RAW = BASE / "raw_docs"          # input documents staged by the ingest step
CONVERTED = BASE / "converted"   # markdown output of the conversion step
CHUNKS_DIR = BASE / "chunks"     # chunk output dir; created eagerly (temp fix)
CHUNKS_DIR.mkdir(parents=True, exist_ok=True)
INDEX_OUT = BASE / "bm25_index.pkl"  # pickled BM25 index location

# Chunking parameters (tokens per chunk / overlap between chunks).
MAX_TOKENS = 512
OVERLAP_TOKENS = 50


# step 1: ingest docs to raw_docs
def ingest_manifest_step():
    """Run the manifest-driven ingest that stages documents into raw_docs."""
    ingest_manifest()

# step 2: convert docs to markdown
def convert_docs_to_markdown_step():
    """Convert every .docx file in RAW to a .md file in CONVERTED.

    Files whose suffix is not .docx are skipped with a log line.
    """
    for doc in RAW.iterdir():
        if doc.suffix.lower() != ".docx":
            # Fix: logger.info uses lazy %-style args; the original passed
            # `doc` as an extra positional with no placeholder, which raises
            # a formatting error inside the logging machinery.
            logger.info("Skipping: %s", doc)
            continue
        out = CONVERTED / (doc.stem + ".md")
        convert_doc_to_md(doc, out)
        logger.info("Converted: %s", out)

# step 3: process md to chunks
def convert_md_to_chunks_step():
    """Chunk every markdown file in CONVERTED and write a manifest JSON.

    Chunk records returned by convert_md_to_chunks are collected and
    written to CHUNKS_DIR/chunks_manifest.json with a UTC timestamp.
    """
    manifests = []
    for md in CONVERTED.iterdir():
        manifests.extend(convert_md_to_chunks(md, CHUNKS_DIR))
    # datetime.utcnow() is deprecated (3.12+); build the same naive-UTC
    # "...Z" string from an aware timestamp instead.
    generated_at = (
        datetime.datetime.now(datetime.timezone.utc)
        .replace(tzinfo=None)
        .isoformat()
        + "Z"
    )
    with open(CHUNKS_DIR / "chunks_manifest.json", "w", encoding="utf-8") as f:
        json.dump(
            {"generated_at": generated_at, "chunks": manifests},
            f,
            ensure_ascii=False,
            indent=2,
        )
    # Fix: the original passed extra positional args with no %-placeholders
    # ('logger.info("Wrote", len(manifests), "chunks")'), which raises a
    # formatting error inside the logging machinery.
    logger.info("Wrote %d chunks", len(manifests))

# step 4, 5: build bm25 index and embedding index
def build_bm25_index_step():
    """Rebuild the BM25 index from CHUNKS_DIR and persist it to INDEX_OUT."""
    # Drop any stale index so the saved artifact is always a fresh build.
    INDEX_OUT.unlink(missing_ok=True)
    index = build_bm25_index(load_chunks(CHUNKS_DIR))
    save_index(index, INDEX_OUT)
    logger.info("Built bm25 index and saved to %s", INDEX_OUT)

def build_embedding_index_step():
    """Rebuild the Chroma embedding index from scratch under BASE/chroma_db.

    Any existing chroma_db directory is removed first so the rebuild starts
    from a clean slate (force_reembed=True re-embeds every chunk).
    """
    # Use the module-level pathlib BASE instead of re-deriving the project
    # root with os.path — keeps this step consistent with the others.
    persist_dir = BASE / "chroma_db"
    if persist_dir.exists():
        shutil.rmtree(persist_dir)
    build_embedding_index_main(
        # Fix: the bare relative string "chunks" only resolved correctly
        # when the process cwd was the project root; anchor it to BASE.
        chunks_dir=str(CHUNKS_DIR),
        persist_dir=str(persist_dir),
        collection="snote",
        model_name="AITeamVN/Vietnamese_Embedding_v2",
        batch_size=100,
        device="cpu",
        force_reembed=True,
    )

if __name__ == "__main__":
    # ingest_manifest_step()
    # convert_docs_to_markdown_step()
    # convert_md_to_chunks_step()
    # build_bm25_index_step()
    build_embedding_index_step()