""" Database Analysis Script. This script connects to the local ChromaDB vector store and performs a quick analysis of the ingested clinical trial data. It prints statistics about: - Top Sponsors - Phase Distribution - Status Distribution - Top Medical Conditions - Sample of Recent Studies Usage: python scripts/analyze_db.py # OR cd scripts && python analyze_db.py """ import lancedb import pandas as pd import os def analyze_db(): """ Connects to ChromaDB and prints summary statistics of the dataset. """ # Determine the project root directory (one level up from this script) script_dir = os.path.dirname(os.path.abspath(__file__)) project_root = os.path.dirname(script_dir) db_path = os.path.join(project_root, "ct_gov_lancedb") if not os.path.exists(db_path): print(f"āŒ Database directory '{db_path}' does not exist.") print(" Please run 'python scripts/ingest_ct.py' first to ingest data.") return print(f"šŸ“‚ Loading database from {db_path}...") try: db = lancedb.connect(db_path) # Check for table existence if "clinical_trials" not in db.table_names(): print(f"āŒ Table 'clinical_trials' not found. Available: {db.table_names()}") return tbl = db.open_table("clinical_trials") count = len(tbl) print(f"āœ… Found 'clinical_trials' table with {count} documents.") # Fetch all data for analysis df = tbl.to_pandas() if df.empty: print("āŒ No data found.") return # Handle metadata if nested (LlamaIndex might nest it) if "metadata" in df.columns: # Try to flatten if it's a struct/dict try: meta_df = pd.json_normalize(df["metadata"]) # Merge with original df or just use meta_df for analysis # We'll use meta_df for the metadata fields analysis # But we might need 'text' from original df = pd.concat([df.drop(columns=["metadata"]), meta_df], axis=1) except: pass if "nct_id" in df.columns: unique_ncts = df["nct_id"].nunique() print(f"šŸ”¢ Unique NCT IDs: {unique_ncts}") if unique_ncts < count: print(f"āš ļø Warning: {count - unique_ncts} duplicate records found!") else: print("āš ļø 'nct_id' field not found in metadata.") # --- Analysis Sections --- print("\nšŸ“Š --- Top 10 Sponsors ---") if "org" in df.columns: print(df["org"].value_counts().head(10)) else: print("āš ļø 'org' field not found in metadata.") print("\nšŸ“Š --- Phase Distribution ---") if "phase" in df.columns: print(df["phase"].value_counts()) else: print("āš ļø 'phase' field not found in metadata.") print("\nšŸ“Š --- Status Distribution ---") if "status" in df.columns: print(df["status"].value_counts()) else: print("āš ļø 'status' field not found in metadata.") print("\nšŸ“Š --- Top Conditions ---") if "condition" in df.columns: # Conditions are comma-separated strings, so we split and explode them all_conditions = [] for conditions in df["condition"].dropna(): all_conditions.extend([c.strip() for c in conditions.split(",")]) print(pd.Series(all_conditions).value_counts().head(10)) else: print("āš ļø 'condition' field not found in metadata.") print("\nšŸ“Š --- Top Interventions ---") if "intervention" in df.columns: # Interventions are semicolon-separated strings (from ingest_ct.py), so we split by "; " all_interventions = [] for interventions in df["intervention"].dropna(): # Split by semicolon and strip whitespace parts = [i.strip() for i in interventions.split(";") if i.strip()] all_interventions.extend(parts) if all_interventions: print(pd.Series(all_interventions).value_counts().head(20)) else: print("No interventions found.") else: print("āš ļø 'intervention' field not found in metadata.") print("\nšŸ“ --- Sample Studies (Most Recent Start Years) ---") if "start_year" in df.columns and "title" in df.columns: # Ensure start_year is numeric for sorting df["start_year"] = pd.to_numeric(df["start_year"], errors="coerce") top_recent = df.sort_values(by="start_year", ascending=False).head(5) for _, row in top_recent.iterrows(): print( f"- [{row.get('start_year', 'N/A')}] {row.get('title', 'N/A')} ({row.get('nct_id', 'N/A')})" ) print(f" Sponsor: {row.get('org', 'N/A')}") print(f" Intervention: {row.get('intervention', 'N/A')}") print("\nšŸ“Š --- Intervention Check ---") if "intervention" in df.columns: non_empty = df[df["intervention"].str.len() > 0] print(f"Total records with interventions: {len(non_empty)}") if not non_empty.empty: print("Sample Intervention:", non_empty.iloc[0]["intervention"]) else: print("āš ļø 'intervention' field not found.") except Exception as e: print(f"āš ļø Error analyzing DB: {e}") if __name__ == "__main__": analyze_db()