snote / scripts /ingest_manifest.py
xuanbao01's picture
Upload folder using huggingface_hub
44c5827 verified
#!/usr/bin/env python3
import os, hashlib, json, shutil, datetime
# Root of the project tree: the parent of the directory containing this script.
BASE = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))

# Locations derived from BASE:
#   INCOMING - drop folder scanned by ingest_manifest()
#   RAW      - folder the ingested files are copied into
#   MANIFEST - JSON file listing every ingested document
INCOMING, RAW, MANIFEST = (
    os.path.join(BASE, leaf) for leaf in ("incoming", "raw_docs", "manifest.json")
)

# Create both working folders at import time so later listing/copying
# never hits a missing directory.
for _folder in (INCOMING, RAW):
    os.makedirs(_folder, exist_ok=True)
def sha256_of_file(path, block_size=65536):
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is opened in binary mode and read in pieces of at most
    *block_size* bytes, so the whole file is never held in memory at once.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        while True:
            chunk = stream.read(block_size)
            if chunk == b"":
                # An empty read marks end of file.
                break
            digest.update(chunk)
    return digest.hexdigest()
def _utc_iso_z(timestamp=None):
    """Format a POSIX timestamp (default: the current time) as UTC ISO-8601 text ending in "Z"."""
    utc = datetime.timezone.utc
    if timestamp is None:
        moment = datetime.datetime.now(utc)
    else:
        moment = datetime.datetime.fromtimestamp(timestamp, utc)
    # Strip tzinfo so isoformat() does not append "+00:00"; the explicit "Z"
    # suffix carries the zone, matching the format this script has always written.
    return moment.replace(tzinfo=None).isoformat() + "Z"


def ingest_manifest():
    """Copy files from INCOMING into RAW and write a JSON manifest of them.

    Every regular file directly inside INCOMING (sub-directories are skipped)
    is processed in sorted filename order:

    * its SHA-256 checksum, size and modification time are read;
    * it is copied to RAW under the same name when no entry exists there yet,
      or when the existing stored file has a different checksum, so the
      manifest never describes content that differs from what is stored;
    * an entry is appended to the manifest's "documents" list.

    The manifest is then written to MANIFEST as UTF-8 JSON and its path is
    printed. Returns None.
    """
    manifest = {"generated_at": _utc_iso_z(), "documents": []}

    for fname in sorted(os.listdir(INCOMING)):
        src = os.path.join(INCOMING, fname)
        if not os.path.isfile(src):
            continue

        checksum = sha256_of_file(src)
        info = os.stat(src)  # one stat call supplies both size and mtime
        dst = os.path.join(RAW, fname)

        if not os.path.exists(dst):
            shutil.copy2(src, dst)
        elif os.path.isfile(dst) and sha256_of_file(dst) != checksum:
            # Same filename, different bytes: without refreshing the stored
            # copy, the checksum recorded below would not match stored_path.
            shutil.copy2(src, dst)
            print("Refreshed stale copy:", dst)

        manifest["documents"].append(
            {
                "filename": fname,
                "checksum_sha256": checksum,
                "size_bytes": info.st_size,
                "mtime_utc": _utc_iso_z(info.st_mtime),
                "stored_path": os.path.relpath(dst, BASE),
            }
        )

    with open(MANIFEST, "w", encoding="utf-8") as f:
        json.dump(manifest, f, ensure_ascii=False, indent=2)
    print("Wrote manifest:", MANIFEST)
# Run the ingest only when this file is executed as a script, not on import.
if __name__ == "__main__":
    ingest_manifest()