import argparse
import os
from pathlib import Path

from dotenv import dotenv_values
import pandas as pd

from app.ingest import ingest
from app.search import search


def get_env():
    """
    Load environment paths. On Hugging Face Spaces, fall back to local 'data/' dirs.
    Never use absolute paths like /Users/... which don't exist in the container.
    """
    env = dotenv_values(".env") or {}

    # Root of the repo (parent of this file's folder)
    ROOT = Path(__file__).resolve().parents[1]

    def default_path(sub):
        return str(ROOT / sub)

    env.setdefault("DATA_DIR", default_path("data"))
    env.setdefault("DOCSTORE_DIR", default_path("data/docstore"))
    env.setdefault("INDEX_DIR", default_path("data/index"))
    env.setdefault("EXPORT_DIR", default_path("data/exports"))

    for k in ["DATA_DIR", "DOCSTORE_DIR", "INDEX_DIR", "EXPORT_DIR"]:
        os.makedirs(env[k], exist_ok=True)
    return env


def ensure_index_exists(env: dict):
    """
    Ensure a FAISS index exists in env['INDEX_DIR'].
    If missing, run a minimal ingest using config/sources.yaml.
    This lets the Hugging Face Space self-heal on first boot.
    """
    index_dir = Path(env["INDEX_DIR"])
    faiss_idx = index_dir / "faiss.index"
    meta_json = index_dir / "meta.json"
    if faiss_idx.exists() and meta_json.exists():
        return  # already built

    print("Index not found. Building now via ingest() …")
    # NOTE: This uses your existing ingestion pipeline.
    # If your ingest relies on API keys, set them in the Space's
    # Settings → Variables and secrets, then Restart the Space.
    path, n = ingest("config/sources.yaml", env)
    print(f"Ingest complete. {n} records. Docstore: {path}")


def cmd_ingest(args):
    env = get_env()
    path, n = ingest("config/sources.yaml", env)
    print(f"Ingest complete. {n} records. Docstore: {path}")


def cmd_search(args):
    env = get_env()
    ensure_index_exists(env)  # <— NEW: auto-build if missing
    filters = {}
    if args.geo:
        filters["geo"] = args.geo.split(",")
    if args.categories:
        filters["categories"] = args.categories.split(",")
    res = search(args.q, env, top_k=args.k, filters=filters)
    for r in res:
        print(f"- {r['title']} [{r['source']}] ({r['geo']}) score={r['score']:.3f}")
        print(f"  {r['url']}")


def cmd_export(args):
    env = get_env()
    ensure_index_exists(env)  # <— NEW: auto-build if missing
    filters = {}
    if args.geo:
        filters["geo"] = args.geo.split(",")
    if args.categories:
        filters["categories"] = args.categories.split(",")
    res = search(args.q, env, top_k=args.k, filters=filters)
    out = Path(env["EXPORT_DIR"]) / (args.out or "results.csv")
    pd.DataFrame(res).to_csv(out, index=False)
    print(f"Exported {len(res)} rows to {out}")


if __name__ == "__main__":
    p = argparse.ArgumentParser()
    sub = p.add_subparsers(dest="cmd")

    p_ing = sub.add_parser("ingest", help="Ingest sources and build index")
    p_ing.set_defaults(func=cmd_ingest)

    p_search = sub.add_parser("search", help="Search index")
    p_search.add_argument("--q", required=True)
    p_search.add_argument("--k", type=int, default=15)
    p_search.add_argument("--geo", default="")
    p_search.add_argument("--categories", default="")
    p_search.set_defaults(func=cmd_search)

    p_export = sub.add_parser("export", help="Export search results to CSV")
    p_export.add_argument("--q", required=True)
    p_export.add_argument("--k", type=int, default=50)
    p_export.add_argument("--geo", default="")
    p_export.add_argument("--categories", default="")
    p_export.add_argument("--out", default="results.csv")
    p_export.set_defaults(func=cmd_export)

    args = p.parse_args()
    if not args.cmd:
        p.print_help()
    else:
        args.func(args)
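
# ---------------------------------------------------------------------------
# Example invocations (a sketch, not part of the original file; assumes this
# script is saved as cli.py at the repo root and that app/ingest.py,
# app/search.py, and config/sources.yaml exist as referenced above — the
# query strings and output filename below are hypothetical):
#
#   python cli.py ingest
#   python cli.py search --q "battery storage" --k 10 --geo US,EU
#   python cli.py export --q "battery storage" --geo US --out storage.csv
#
# --geo and --categories take comma-separated values, matching the
# str.split(",") parsing in cmd_search / cmd_export.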