|
|
|
|
|
import os, hashlib, json, shutil, datetime |
|
|
# Project root: the parent of the directory that contains this file.
BASE = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "..")
)

# Directory that is listed for files to ingest.
INCOMING = os.path.join(BASE, "incoming")
# Directory that receives the copies of ingested files.
RAW = os.path.join(BASE, "raw_docs")
# Path of the JSON manifest, written directly under the project root.
MANIFEST = os.path.join(BASE, "manifest.json")

# Both working directories are created at import time; exist_ok makes this a
# no-op when they are already present.
for _required_dir in (INCOMING, RAW):
    os.makedirs(_required_dir, exist_ok=True)
|
|
def sha256_of_file(path, block_size=65536):
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is opened in binary mode and fed to the hash in chunks of
    ``block_size`` bytes, so with a positive ``block_size`` the whole file
    is never read into memory at once.

    Args:
        path: Location of the file to hash.
        block_size: Number of bytes requested per read. Must not be zero.

    Returns:
        The digest as a string of hexadecimal digits.

    Raises:
        ValueError: If ``block_size`` is zero.
        OSError: If the file cannot be opened or read.
    """
    # read(0) returns b"" immediately, which would end the loop before any
    # data is hashed and silently yield the digest of an empty input.
    if block_size == 0:
        raise ValueError("block_size must not be zero")

    digest = hashlib.sha256()
    with open(path, "rb") as f:
        while True:
            block = f.read(block_size)
            if not block:  # b"" signals end of file
                break
            digest.update(block)
    return digest.hexdigest()
|
|
|
|
|
def _utc_iso_z(moment):
    """Format an aware UTC datetime as naive ISO-8601 text with a "Z" suffix."""
    return moment.replace(tzinfo=None).isoformat() + "Z"


def _resolve_destination(fname, checksum):
    """Return the path under RAW where content with this checksum is stored.

    The plain ``RAW/fname`` path is used when it is free or already holds
    identical content. Otherwise a checksum-qualified name is returned so an
    existing, different file is neither overwritten nor misreported.
    """
    dst = os.path.join(RAW, fname)
    if not os.path.exists(dst):
        return dst
    if os.path.isfile(dst) and sha256_of_file(dst) == checksum:
        return dst
    stem, ext = os.path.splitext(fname)
    return os.path.join(RAW, stem + "." + checksum[:12] + ext)


def ingest_manifest():
    """Copy files from INCOMING into RAW and write a JSON manifest of them.

    Every regular file directly inside ``INCOMING`` is hashed, copied into
    ``RAW`` unless it is already stored there, and recorded with its name,
    SHA-256 checksum, size, modification time (UTC) and stored path relative
    to ``BASE``. Entries that are not regular files are skipped.

    If ``RAW`` already holds an entry with the same name but different
    content, the incoming file is stored under a checksum-qualified name
    instead, so the recorded checksum always describes ``stored_path``.

    The manifest is written to ``MANIFEST`` and its path is printed.

    Returns:
        The manifest dictionary that was written.
    """
    utc = datetime.timezone.utc
    # utcnow()/utcfromtimestamp() are deprecated since Python 3.12; aware
    # datetimes are used instead and formatted to the same "...Z" text.
    manifest = {
        "generated_at": _utc_iso_z(datetime.datetime.now(utc)),
        "documents": [],
    }

    for fname in sorted(os.listdir(INCOMING)):
        src = os.path.join(INCOMING, fname)
        if not os.path.isfile(src):
            continue

        checksum = sha256_of_file(src)
        # One stat call supplies both the size and the modification time.
        info = os.stat(src)
        mtime = _utc_iso_z(datetime.datetime.fromtimestamp(info.st_mtime, utc))

        dst = _resolve_destination(fname, checksum)
        # An existing destination already holds this content (same name and
        # checksum, or a checksum-qualified name), so it is not copied again.
        if not os.path.exists(dst):
            shutil.copy2(src, dst)

        manifest["documents"].append({
            "filename": fname,
            "checksum_sha256": checksum,
            "size_bytes": info.st_size,
            "mtime_utc": mtime,
            "stored_path": os.path.relpath(dst, BASE),
        })

    with open(MANIFEST, "w", encoding="utf-8") as f:
        json.dump(manifest, f, ensure_ascii=False, indent=2)
    print("Wrote manifest:", MANIFEST)
    return manifest
|
|
|
|
|
# Run the ingestion only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    ingest_manifest()