Spaces:
Sleeping
Sleeping
| # app.py | |
| # ============================================================ | |
| # Hugging Face Docker Space (Gradio) - Hotel Cancellation Project | |
| # 3 Tabs: | |
| # 1) Run Pipeline + Execution Logs | |
| # 2) Results + Visualizations (Python + R) | |
| # 3) Predict Cancellation Probability (Python RF + R LASSO) | |
| # | |
| # Repo must contain: | |
| # booking.csv | |
| # 1_Data_Creation.ipynb | |
| # 2_Python_Analysis.ipynb | |
| # 3_R_Analysis.ipynb | |
| # requirements.txt | |
| # Dockerfile (installs R + IRkernel + needed R packages) | |
| # | |
| # Generated by notebooks: | |
| # hotel_cancel_model_dataset.csv, features.json, dataset_meta.json, train.csv, test.csv | |
| # artifacts/py/... and artifacts/r/... | |
| # ============================================================ | |
| import sys | |
| import traceback | |
| import json | |
| import os | |
| import subprocess | |
| from pathlib import Path | |
| from typing import Dict, Any, Tuple, Optional | |
| import pandas as pd | |
| import gradio as gr | |
| import joblib | |
| # ============================================================ | |
| # 0) Config (YOUR notebook filenames) | |
| # ============================================================ | |
# Every path below is anchored at the process working directory.
BASE_DIR = Path.cwd()

# Notebook filenames: data creation, Python analysis, R analysis.
DATA_NOTEBOOK, PY_NOTEBOOK, R_NOTEBOOK = (
    "1_Data_Creation.ipynb",
    "2_Python_Analysis.ipynb",
    "3_R_Analysis.ipynb",
)

# Directory for executed notebook copies; created at import time if absent.
RUNS_DIR = BASE_DIR.joinpath("runs")
RUNS_DIR.mkdir(exist_ok=True)

# Files produced by the data-creation step.
DATASET_PATH = BASE_DIR.joinpath("hotel_cancel_model_dataset.csv")
FEATURES_PATH = BASE_DIR.joinpath("features.json")

# Model artifacts read by the prediction helpers.
PY_MODEL_PATH = BASE_DIR.joinpath("artifacts", "py", "models", "model.joblib")
R_MODEL_PATH = BASE_DIR.joinpath("artifacts", "r", "models", "model.rds")
R_METRICS_PATH = BASE_DIR.joinpath("artifacts", "r", "metrics", "metrics.json")
| # ============================================================ | |
| # 1) Notebook execution helpers | |
| # ============================================================ | |
def _run_notebook(nb_name: str, out_name: str) -> str:
    """
    Execute a notebook with papermill and return a human-readable log.

    Args:
        nb_name: Notebook filename, resolved relative to BASE_DIR.
        out_name: Filename of the executed copy, written under RUNS_DIR.

    Returns:
        A log string with the run summary, the captured stdout/stderr and
        the papermill return code. Problems are never raised: a missing
        notebook, a launch failure or a non-zero exit code are all reported
        in the returned text with a leading "❌".
    """
    nb_in = BASE_DIR / nb_name
    nb_out = RUNS_DIR / out_name
    if not nb_in.exists():
        return f"❌ Notebook not found: {nb_in}\nCheck the filename in app.py."

    # Python notebooks use the "python3" kernel. The R notebook uses the
    # kernel named by R_KERNEL_NAME, defaulting to "ir".
    if nb_name == R_NOTEBOOK:
        kernel = os.environ.get("R_KERNEL_NAME", "ir")
    else:
        kernel = "python3"

    cmd = ["papermill", str(nb_in), str(nb_out), "-k", kernel]
    try:
        # check=False: a failing notebook must still hand back its output so
        # it can be shown in the log instead of raising CalledProcessError.
        proc = subprocess.run(cmd, capture_output=True, text=True, check=False)
    except Exception as e:
        # Typically the papermill executable could not be started at all.
        return f"❌ Failed to execute {nb_name}: {repr(e)}"

    parts = [
        f"▶ Running: {nb_name}",
        f"▶ Kernel : {kernel}",
        f"▶ Output : {nb_out.name}",
        "",
    ]
    if proc.stdout:
        parts.append("----- STDOUT -----")
        parts.append(proc.stdout)
    if proc.stderr:
        parts.append("----- STDERR -----")
        parts.append(proc.stderr)
    parts.append("")

    # Only a zero exit status is a success; anything else is flagged so a
    # failed run is not mistaken for a passing one.
    status = "✅" if proc.returncode == 0 else "❌"
    parts.append(f"{status} Return code: {proc.returncode}")
    return "\n".join(parts)
def run_data_prep() -> str:
    """Execute the data-creation notebook and return its execution log."""
    return _run_notebook(nb_name=DATA_NOTEBOOK, out_name="1_Data_Creation_RUN.ipynb")
def run_python_model() -> str:
    """Execute the Python analysis notebook and return its execution log."""
    return _run_notebook(nb_name=PY_NOTEBOOK, out_name="2_Python_Analysis_RUN.ipynb")
def run_r_model() -> str:
    """Execute the R analysis notebook and return its execution log."""
    return _run_notebook(nb_name=R_NOTEBOOK, out_name="3_R_Analysis_RUN.ipynb")
def run_all() -> str:
    """
    Run data creation, Python analysis and R analysis, in that order.

    Every step is executed regardless of the outcome of the previous one.

    Returns:
        The three step logs joined into one string, separated by a row of
        80 "=" characters.
    """
    steps = (run_data_prep, run_python_model, run_r_model)
    step_logs = [step() for step in steps]

    # The rule line carries its own newlines and is joined with one more on
    # each side, which leaves blank lines around it in the combined log.
    rule = "\n" + "=" * 80 + "\n"
    return ("\n" + rule + "\n").join(step_logs)
| # ============================================================ | |
| # 2) Safe file readers for Results tab | |
| # ============================================================ | |
def _safe_read_json(path: Path) -> Optional[Dict[str, Any]]:
    """
    Parse a UTF-8 JSON file without ever raising.

    Args:
        path: Location of the JSON file.

    Returns:
        The decoded JSON content, or None when the file does not exist or
        cannot be read or parsed.
    """
    if path.exists():
        try:
            return json.loads(path.read_text(encoding="utf-8"))
        except Exception:
            # Best effort by design: any read/parse problem means "no data".
            pass
    return None
def _safe_read_csv(path: Path, nrows: Optional[int] = None) -> Optional[pd.DataFrame]:
    """
    Load a CSV file into a DataFrame without ever raising.

    Args:
        path: Location of the CSV file.
        nrows: Maximum number of rows to read; None reads the whole file.

    Returns:
        The loaded DataFrame, or None when the file does not exist or
        pandas fails to read it.
    """
    frame: Optional[pd.DataFrame] = None
    if path.exists():
        try:
            frame = pd.read_csv(path, nrows=nrows)
        except Exception:
            # Best effort by design: an unreadable file counts as absent.
            frame = None
    return frame
def load_results():
    """
    Collect the saved Python and R artifacts for the Results tab.

    Missing or unreadable files degrade to placeholders: {} for metrics,
    an empty DataFrame for tables and None for images.

    Returns:
        A 9-tuple, in the order expected by the Gradio outputs:
        (py_metrics, r_metrics, py_conf, py_roc, r_roc,
         py_fi, r_coef, py_pred, r_pred).
        Image entries are string file paths. Every table except the Python
        feature importances is limited to its first 50 rows.
    """
    py_dir = BASE_DIR / "artifacts" / "py"
    r_dir = BASE_DIR / "artifacts" / "r"

    def _metrics(path):
        # Metrics JSON, or an empty dict when it is missing/unreadable.
        loaded = _safe_read_json(path)
        return {} if loaded is None else loaded

    def _table(path, nrows=None):
        # Table CSV, or an empty DataFrame when it is missing/unreadable.
        loaded = _safe_read_csv(path, nrows=nrows)
        return pd.DataFrame() if loaded is None else loaded

    def _image(path):
        # Image components receive a string path, or None if no file exists.
        return str(path) if path.exists() else None

    return (
        _metrics(py_dir / "metrics" / "metrics.json"),
        _metrics(r_dir / "metrics" / "metrics.json"),
        _image(py_dir / "figures" / "confusion_matrix.png"),
        _image(py_dir / "figures" / "roc_curve.png"),
        _image(r_dir / "figures" / "roc_curve.png"),
        _table(py_dir / "tables" / "feature_importances.csv"),
        _table(r_dir / "tables" / "coefficients.csv", nrows=50),
        _table(py_dir / "tables" / "test_predictions.csv", nrows=50),
        _table(r_dir / "tables" / "test_predictions.csv", nrows=50),
    )
| # ============================================================ | |
| # 3) Prediction (Python + R) | |
| # ============================================================ | |
def _load_schema() -> Dict[str, Any]:
    """
    Read the feature schema stored at FEATURES_PATH.

    Returns:
        The decoded content of features.json.

    Raises:
        FileNotFoundError: If features.json does not exist yet.
    """
    if FEATURES_PATH.exists():
        return json.loads(FEATURES_PATH.read_text(encoding="utf-8"))
    raise FileNotFoundError("features.json not found. Run the Data Creation notebook first.")
def _predict_python(py_model, features: Dict[str, Any]) -> float:
    """
    Score one booking with the Python model.

    Args:
        py_model: Fitted estimator exposing predict_proba (loaded via joblib
            by the caller).
        features: Mapping of feature name to value. It must contain every
            name listed under "features" in the schema.

    Returns:
        The value in column index 1 of predict_proba for the single input
        row, as a float. Presumably the "cancelled" class; confirm against
        the training notebook.

    Raises:
        KeyError: If a schema feature is missing from `features`.
    """
    feature_names = _load_schema()["features"]
    # Column order follows the schema, not the order of the incoming dict.
    row = {name: features[name] for name in feature_names}
    frame = pd.DataFrame([row])
    second_column = py_model.predict_proba(frame)[:, 1]
    return float(second_column[0])
def _predict_r(features: Dict[str, Any]) -> float:
    """
    Predict cancellation probability with the saved R glmnet model.

    The input is handed to an `Rscript` subprocess through a temporary JSON
    file. The R code re-reads the training dataset to recover the factor
    levels of character columns, builds a sparse model matrix without the
    intercept column, and predicts at the `lambda_1se` stored in the R
    metrics file. Requires R (with jsonlite, glmnet, Matrix) to be installed.

    Args:
        features: Mapping of feature name to value for a single booking.

    Returns:
        The predicted probability printed by the R script, as a float.

    Raises:
        FileNotFoundError: If the R model, the dataset or the R metrics file
            is missing (or if the Rscript executable cannot be found).
        RuntimeError: If Rscript exits non-zero or prints something that is
            not a number.
    """
    import tempfile  # function-scope import: only this helper needs it

    if not R_MODEL_PATH.exists():
        raise FileNotFoundError("R model not found. Run the R notebook first.")
    if not DATASET_PATH.exists():
        raise FileNotFoundError("hotel_cancel_model_dataset.csv not found. Run the Data Creation notebook first.")
    if not R_METRICS_PATH.exists():
        raise FileNotFoundError("R metrics not found. Run the R notebook first.")

    # One uniquely named temp file per call, so simultaneous predictions
    # cannot overwrite each other's input.
    fd, tmp_name = tempfile.mkstemp(prefix="r_input_", suffix=".json")
    tmp_input = Path(tmp_name)
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            json.dump(features, f)

        # Doubled braces are literal R braces inside this f-string.
        r_script = f"""
suppressPackageStartupMessages(library(jsonlite))
suppressPackageStartupMessages(library(glmnet))
suppressPackageStartupMessages(library(Matrix))
dataset_path <- "{DATASET_PATH.as_posix()}"
features_path <- "{FEATURES_PATH.as_posix()}"
model_path <- "{R_MODEL_PATH.as_posix()}"
metrics_path <- "{R_METRICS_PATH.as_posix()}"
input_path <- "{tmp_input.as_posix()}"
df <- read.csv(dataset_path, stringsAsFactors = FALSE)
schema <- fromJSON(features_path)
FEATURES <- schema$features
metrics <- fromJSON(metrics_path)
lambda_1se <- metrics$lambda_1se
fit <- readRDS(model_path)
inp <- fromJSON(input_path)
x_df <- as.data.frame(inp, stringsAsFactors = FALSE)
for (c in FEATURES) {{
if (is.null(x_df[[c]])) stop(paste("Missing input feature:", c))
if (is.character(df[[c]]) || is.character(x_df[[c]])) {{
levs <- unique(df[[c]])
x_df[[c]] <- factor(x_df[[c]], levels = levs)
}}
}}
f <- as.formula(paste("~", paste(FEATURES, collapse = " + ")))
X <- sparse.model.matrix(f, data = x_df)[, -1, drop = FALSE]
p <- as.numeric(predict(fit, newx = X, s = lambda_1se, type = "response"))[1]
cat(p)
"""
        proc = subprocess.run(["Rscript", "-e", r_script], capture_output=True, text=True)
    finally:
        # Runs even when Rscript could not be launched, so the temp file is
        # never left behind. Removal itself stays best-effort.
        try:
            tmp_input.unlink(missing_ok=True)
        except OSError:
            pass

    if proc.returncode != 0:
        raise RuntimeError(f"R prediction failed:\n{proc.stderr}")
    try:
        return float(proc.stdout.strip())
    except ValueError as exc:
        raise RuntimeError(
            f"Could not parse R output as float.\nSTDOUT:\n{proc.stdout}\nSTDERR:\n{proc.stderr}"
        ) from exc
def predict_both(
    lead_time: float,
    average_price: float,
    total_nights: float,
    total_guests: float,
    market_segment_type: str,
    type_of_meal: str,
    special_requests: float,
    price_per_guest: float,
):
    """
    Gradio callback: score one booking with both the Python and the R model.

    Numeric inputs are coerced to float and the two categorical inputs to
    str before being passed to the models.

    Returns:
        A 3-tuple (py_text, r_text, comp_df): a Markdown sentence for each
        model and a two-row DataFrame with columns "model" and "p_cancel_%"
        (percentages rounded to one decimal).

    Raises:
        FileNotFoundError: If the Python model file does not exist. Errors
            raised by the two prediction helpers propagate unchanged.
    """
    raw_inputs = dict(
        lead_time=lead_time,
        average_price=average_price,
        total_nights=total_nights,
        total_guests=total_guests,
        market_segment_type=market_segment_type,
        type_of_meal=type_of_meal,
        special_requests=special_requests,
        price_per_guest=price_per_guest,
    )
    text_fields = ("market_segment_type", "type_of_meal")
    features = {
        name: str(value) if name in text_fields else float(value)
        for name, value in raw_inputs.items()
    }

    # Python model: loaded from disk on every call.
    if not PY_MODEL_PATH.exists():
        raise FileNotFoundError("Python model not found. Run the Python notebook first.")
    py_proba = _predict_python(joblib.load(PY_MODEL_PATH), features)

    # R model: scored through an Rscript subprocess.
    r_proba = _predict_r(features)

    py_pct = py_proba * 100
    r_pct = r_proba * 100
    py_text = f"Python (Random Forest) cancellation probability: **{py_pct:.1f}%**"
    r_text = f"R (LASSO Logistic Regression) cancellation probability: **{r_pct:.1f}%**"

    rows = [
        ("Python Random Forest", py_pct),
        ("R LASSO Logistic Regression", r_pct),
    ]
    comp_df = pd.DataFrame(
        [{"model": label, "p_cancel_%": round(pct, 1)} for label, pct in rows]
    )
    return py_text, r_text, comp_df
| # ============================================================ | |
| # 4) Dropdown choices (from dataset categories) | |
| # ============================================================ | |
def get_dropdown_choices():
    """
    Build the dropdown choices for the two categorical inputs.

    Choices are taken from the dataset so they match the categories used
    for training. Hard-coded defaults are returned when the dataset is not
    available yet, cannot be parsed, or lacks the required columns.

    Returns:
        A pair (market_choices, meal_choices) of sorted lists with the
        distinct non-null values of "market_segment_type" and
        "type_of_meal".
    """
    fallback = (
        ["Online", "Offline", "Corporate"],
        ["Meal Plan 1", "Meal Plan 2", "Not Selected"],
    )
    if not DATASET_PATH.exists():
        return fallback

    try:
        # Only two columns are needed, so skip parsing the rest of the file.
        df = pd.read_csv(DATASET_PATH, usecols=["market_segment_type", "type_of_meal"])
    except (OSError, ValueError):
        # pandas reports empty, malformed or column-less files as ValueError
        # subclasses. This function is called while the UI is constructed,
        # so a broken dataset must not prevent the app from starting and
        # the pipeline from being re-run.
        return fallback

    market_choices = sorted(df["market_segment_type"].dropna().unique().tolist())
    meal_choices = sorted(df["type_of_meal"].dropna().unique().tolist())
    return market_choices, meal_choices
| # ============================================================ | |
| # 5) Build Gradio UI (3 tabs) | |
| # ============================================================ | |
# Optional custom stylesheet. Read through pathlib so the file handle is
# closed, and skipped when style.css is absent so a missing stylesheet does
# not stop the app from starting.
_css_path = BASE_DIR / "style.css"
_custom_css = _css_path.read_text(encoding="utf-8") if _css_path.is_file() else None

with gr.Blocks(
    title="Hotel Booking Cancellation Prediction",
    theme=gr.themes.Soft(primary_hue="blue"),
    css=_custom_css,
) as demo:
    gr.Markdown(
        """
        # 🏨 Hotel Booking Cancellation Prediction
        This app runs the full pipeline and compares two models:
        - **Python : Random Forest**
        - **R : LASSO Logistic Regression**
        **Tabs**
        1) Run Models
        2) Results & Visualizations
        3) Predict Cancellation Probability (both models)
        """
    )

    # -----------------------------
    # TAB 1: Run Models
    # One button per pipeline step; all of them write to the same log box.
    # -----------------------------
    with gr.Tab("1. Run Model"):
        gr.Markdown("Run each step and inspect the execution logs.")
        with gr.Row():
            btn_data = gr.Button("Run Data Creation")
            btn_py = gr.Button("Run Python Analysis")
            btn_r = gr.Button("Run R Analysis")
            btn_all = gr.Button("Run All (1→2→3)")
        log_box = gr.Textbox(
            label="Execution Log",
            lines=22,
            value="Click a button to run a step. Logs will appear here.",
        )
        btn_data.click(fn=run_data_prep, outputs=log_box)
        btn_py.click(fn=run_python_model, outputs=log_box)
        btn_r.click(fn=run_r_model, outputs=log_box)
        btn_all.click(fn=run_all, outputs=log_box)

    # -----------------------------
    # TAB 2: Results & Visualizations
    # Components are filled only when "Refresh Results" is clicked.
    # -----------------------------
    with gr.Tab("2. Results & Visualizations"):
        gr.Markdown("Loads the latest saved artifacts from **artifacts/py/** and **artifacts/r/**.")
        btn_refresh = gr.Button("Refresh Results")
        with gr.Row():
            py_metrics_view = gr.JSON(label="Python Metrics (metrics.json)")
            r_metrics_view = gr.JSON(label="R Metrics (metrics.json)")
        with gr.Row():
            py_conf_img = gr.Image(label="Python Confusion Matrix", type="filepath")
            py_roc_img = gr.Image(label="Python ROC Curve", type="filepath")
            r_roc_img = gr.Image(label="R ROC Curve", type="filepath")
        with gr.Row():
            py_fi_table = gr.Dataframe(label="Python Feature Importances (top)", interactive=False)
            r_coef_table = gr.Dataframe(label="R Coefficients (top)", interactive=False)
        with gr.Row():
            py_pred_table = gr.Dataframe(label="Python Test Predictions (top 50)", interactive=False)
            r_pred_table = gr.Dataframe(label="R Test Predictions (top 50)", interactive=False)

        def _refresh():
            """Reload every artifact; the tuple order matches `outputs` below."""
            return load_results()

        btn_refresh.click(
            fn=_refresh,
            outputs=[
                py_metrics_view, r_metrics_view,
                py_conf_img, py_roc_img, r_roc_img,
                py_fi_table, r_coef_table,
                py_pred_table, r_pred_table,
            ],
        )

    # -----------------------------
    # TAB 3: Predict
    # -----------------------------
    with gr.Tab("3. Predictor"):
        gr.Markdown(
            "Enter booking details and predict cancellation probability with **both models**.\n"
            "Dropdown values are taken from the dataset categories."
        )
        # Evaluated once, while the UI is built: the choices reflect the
        # dataset present at startup.
        market_choices, meal_choices = get_dropdown_choices()
        with gr.Row():
            lead_time = gr.Number(label="lead_time", value=30)
            average_price = gr.Number(label="average_price", value=100)
        with gr.Row():
            total_nights = gr.Number(label="total_nights", value=3)
            total_guests = gr.Number(label="total_guests", value=2)
        with gr.Row():
            market_segment_type = gr.Dropdown(
                label="market_segment_type",
                choices=market_choices,
                value=market_choices[0] if market_choices else None,
            )
            type_of_meal = gr.Dropdown(
                label="type_of_meal",
                choices=meal_choices,
                value=meal_choices[0] if meal_choices else None,
            )
        with gr.Row():
            special_requests = gr.Number(label="special_requests", value=1)
            price_per_guest = gr.Number(label="price_per_guest", value=50)
        btn_predict = gr.Button("Predict Cancellation Probability")
        py_pred_text = gr.Markdown()
        r_pred_text = gr.Markdown()
        comp_table = gr.Dataframe(label="Model Comparison", interactive=False)
        # Input order must match the positional parameters of predict_both.
        btn_predict.click(
            fn=predict_both,
            inputs=[
                lead_time, average_price,
                total_nights, total_guests,
                market_segment_type, type_of_meal,
                special_requests, price_per_guest,
            ],
            outputs=[py_pred_text, r_pred_text, comp_table],
        )
| # ============================================================ | |
| # 6) Launch | |
| # ============================================================ | |
if __name__ == "__main__":
    # Script entry point: start the Gradio server and exit with status 1 if
    # startup fails. `sys` and `traceback` are the module-level imports.
    try:
        print("✅ app.py starting...", flush=True)

        # PORT takes precedence (Hugging Face may provide it), then
        # GRADIO_SERVER_PORT, then 7860.
        fallback_port = os.getenv("GRADIO_SERVER_PORT", "7860")
        port = int(os.getenv("PORT", fallback_port))
        host = os.getenv("GRADIO_SERVER_NAME", "0.0.0.0")
        print(f"✅ Launching Gradio on {host}:{port}", flush=True)

        launch_options = dict(
            server_name=host,
            server_port=port,
            debug=True,
            show_error=True,
        )
        demo.launch(**launch_options)
    except Exception:
        print("❌ App crashed during startup:", flush=True)
        traceback.print_exc()
        sys.exit(1)