zhimin-z committed
Commit a15931a · 1 Parent(s): e7d88ff
Files changed (7)
  1. .gitignore +1 -1
  2. Dockerfile +6 -18
  3. README.md +1 -1
  4. app.py +292 -1141
  5. docker-compose.yml +21 -0
  6. msr.py +635 -475
  7. requirements.txt +3 -5
.gitignore CHANGED
@@ -2,4 +2,4 @@
2
  *.env
3
  *.venv
4
  *.ipynb
5
- *.pyc
 
2
  *.env
3
  *.venv
4
  *.ipynb
5
+ *.pyc
Dockerfile CHANGED
@@ -1,34 +1,22 @@
1
- # Use official Python runtime as base image
2
  FROM python:3.12-slim
3
 
4
  # Set working directory
5
  WORKDIR /app
6
 
7
- # Install system dependencies (if needed)
8
  RUN apt-get update && apt-get install -y \
9
- git \
 
10
  && rm -rf /var/lib/apt/lists/*
11
 
12
- # Copy requirements.txt
13
  COPY requirements.txt .
14
 
15
  # Install Python dependencies
16
  RUN pip install --no-cache-dir -r requirements.txt
17
 
18
- # Copy application files
19
- COPY .env .
20
- COPY msr.py .
21
-
22
- # Create a non-root user for security (optional but recommended)
23
- RUN useradd -m -u 1000 appuser && chown -R appuser:appuser /app
24
- USER appuser
25
-
26
- # Expose port for Gradio web interface (default is 7860)
27
- EXPOSE 7860
28
-
29
  # Set environment variables
30
- ENV GRADIO_SERVER_NAME=0.0.0.0
31
- ENV GRADIO_SERVER_PORT=7860
32
 
33
- # Run the Gradio app
34
  CMD ["python", "msr.py"]
 
 
1
  FROM python:3.12-slim
2
 
3
  # Set working directory
4
  WORKDIR /app
5
 
6
+ # Install system dependencies
7
  RUN apt-get update && apt-get install -y \
8
+ gcc \
9
+ g++ \
10
  && rm -rf /var/lib/apt/lists/*
11
 
12
+ # Copy requirements file
13
  COPY requirements.txt .
14
 
15
  # Install Python dependencies
16
  RUN pip install --no-cache-dir -r requirements.txt
17
 
18
  # Set environment variables
19
+ ENV PYTHONUNBUFFERED=1
 
20
 
21
+ # Run the mining script with scheduler
22
  CMD ["python", "msr.py"]
README.md CHANGED
@@ -57,7 +57,7 @@ We search GitHub using multiple query patterns to catch all PRs associated with
57
  - Co-authored commits (`co-authored-by:`)
58
 
59
  **Regular Updates**
60
- The leaderboard refreshes automatically every day at 12:00 AM UTC.
61
 
62
  **Community Submissions**
63
  Anyone can submit a coding agent to track via the leaderboard. We store agent metadata in Hugging Face datasets (`SWE-Arena/bot_metadata`) and issue metadata in `SWE-Arena/issue_metadata`. The leaderboard is dynamically constructed from the issue metadata. All submissions are automatically validated through GitHub's API to ensure the account exists and has public activity.
 
57
  - Co-authored commits (`co-authored-by:`)
58
 
59
  **Regular Updates**
60
+ The leaderboard refreshes automatically on the 8th of each month at 12:00 AM UTC.
61
 
62
  **Community Submissions**
63
  Anyone can submit a coding agent to track via the leaderboard. We store agent metadata in Hugging Face datasets (`SWE-Arena/bot_metadata`) and issue metadata in `SWE-Arena/issue_metadata`. The leaderboard is dynamically constructed from the issue metadata. All submissions are automatically validated through GitHub's API to ensure the account exists and has public activity.
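
The monthly refresh described above maps onto the APScheduler setup already imported in app.py (`BackgroundScheduler` + `CronTrigger`). Below is a minimal sketch of such a job; the function name `refresh_leaderboard` and the exact day-of-month value are illustrative placeholders, not taken from this commit.

```python
# Minimal sketch: monthly refresh at 12:00 AM UTC with APScheduler.
# `refresh_leaderboard` and the day value are placeholders, not from this commit.
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger


def refresh_leaderboard():
    # Placeholder for the real mining/refresh logic (see msr.py).
    print("Refreshing leaderboard data...")


scheduler = BackgroundScheduler(timezone="UTC")
scheduler.add_job(
    refresh_leaderboard,
    trigger=CronTrigger(day=8, hour=0, minute=0),  # 8th of each month, 12:00 AM UTC
    id="monthly_leaderboard_refresh",
    replace_existing=True,
)
scheduler.start()
```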
app.py CHANGED
@@ -3,21 +3,17 @@ from gradio_leaderboard import Leaderboard, ColumnFilter
3
  import json
4
  import os
5
  import time
6
- import tempfile
7
  import requests
8
- from datetime import datetime, timezone, timedelta
9
- from collections import defaultdict
10
  from huggingface_hub import HfApi, hf_hub_download
11
  from huggingface_hub.errors import HfHubHTTPError
 
12
  from dotenv import load_dotenv
13
  import pandas as pd
14
- import backoff
15
  import random
16
  import plotly.graph_objects as go
17
  from plotly.subplots import make_subplots
18
  from apscheduler.schedulers.background import BackgroundScheduler
19
  from apscheduler.triggers.cron import CronTrigger
20
- from google.cloud import bigquery
21
 
22
  # Load environment variables
23
  load_dotenv()
@@ -27,10 +23,7 @@ load_dotenv()
27
  # =============================================================================
28
 
29
  AGENTS_REPO = "SWE-Arena/bot_metadata" # HuggingFace dataset for agent metadata
30
- PR_METADATA_REPO = "SWE-Arena/pr_metadata" # HuggingFace dataset for PR metadata
31
- LEADERBOARD_REPO = "SWE-Arena/leaderboard_metadata" # For storing computed leaderboard data
32
- LEADERBOARD_TIME_FRAME_DAYS = 180 # Time frame for constructing leaderboard
33
- UPDATE_TIME_FRAME_DAYS = 30 # Time frame for mining new PRs
34
 
35
  LEADERBOARD_COLUMNS = [
36
  ("Agent Name", "string"),
@@ -40,71 +33,8 @@ LEADERBOARD_COLUMNS = [
40
  ("Acceptance Rate (%)", "number"),
41
  ]
42
 
43
- # Global cache for leaderboard data (loaded once at startup)
44
- _LEADERBOARD_CACHE = None
45
-
46
  # =============================================================================
47
- # JSONL FILE OPERATIONS
48
- # =============================================================================
49
-
50
- def load_jsonl(filename):
51
- """Load JSONL file and return list of dictionaries."""
52
- if not os.path.exists(filename):
53
- return []
54
-
55
- data = []
56
- with open(filename, 'r', encoding='utf-8') as f:
57
- for line in f:
58
- line = line.strip()
59
- if line:
60
- try:
61
- entry = json.loads(line)
62
- data.append(entry)
63
- except json.JSONDecodeError as e:
64
- print(f"Warning: Skipping invalid JSON line: {e}")
65
- return data
66
-
67
-
68
- def save_jsonl(filename, data):
69
- """Save list of dictionaries to JSONL file."""
70
- with open(filename, 'w', encoding='utf-8') as f:
71
- for item in data:
72
- f.write(json.dumps(item) + '\n')
73
-
74
-
75
- def parse_date_string(date_string):
76
- """
77
- Parse date string to datetime object, handling various formats.
78
-
79
- Handles:
80
- - ISO format with 'T' or space between date and time
81
- - Timezone with 'Z' or incomplete offset (+00, -00)
82
- - Complete timezone offset (+00:00, -00:00)
83
-
84
- Args:
85
- date_string: Date string in various formats
86
-
87
- Returns:
88
- datetime object or raises exception
89
- """
90
- if not date_string:
91
- raise ValueError("Empty date string")
92
-
93
- # Replace space with 'T' for ISO format compatibility
94
- date_string = date_string.replace(' ', 'T')
95
-
96
- # Fix incomplete timezone offset (+00 or -00 -> +00:00 or -00:00)
97
- if date_string[-3:-2] in ('+', '-') and ':' not in date_string[-3:]:
98
- date_string = date_string + ':00'
99
-
100
- # Parse the date string (handles both with and without microseconds)
101
- dt = datetime.fromisoformat(date_string.replace('Z', '+00:00'))
102
-
103
- return dt
104
-
105
-
106
- # =============================================================================
107
- # HUGGINGFACE API RETRY WRAPPERS
108
  # =============================================================================
109
 
110
  def is_rate_limit_error(e):
@@ -114,368 +44,123 @@ def is_rate_limit_error(e):
114
  return False
115
 
116
 
117
- def backoff_handler(details):
118
- """Handler to print retry attempt information."""
119
- wait_time = details['wait']
120
- tries = details['tries']
121
- wait_minutes = wait_time / 60
122
- print(f" ⏳ Rate limited. Retrying in {wait_minutes:.1f} minutes ({wait_time:.0f}s) - attempt {tries}/8...")
123
-
124
-
125
  @backoff.on_exception(
126
  backoff.expo,
127
  HfHubHTTPError,
128
- giveup=lambda e: not is_rate_limit_error(e),
129
  max_tries=8,
130
- base=300, # Start at 5 minutes (300 seconds)
131
- max_value=3600, # Cap at 60 minutes (3600 seconds)
132
- jitter=backoff.full_jitter,
133
- on_backoff=backoff_handler
 
 
134
  )
135
  def list_repo_files_with_backoff(api, **kwargs):
136
- """Wrapper for HfApi.list_repo_files with exponential backoff on rate limits."""
137
  return api.list_repo_files(**kwargs)
138
 
139
 
140
  @backoff.on_exception(
141
  backoff.expo,
142
  HfHubHTTPError,
143
- giveup=lambda e: not is_rate_limit_error(e),
144
  max_tries=8,
145
- base=300, # Start at 5 minutes (300 seconds)
146
- max_value=3600, # Cap at 60 minutes (3600 seconds)
147
- jitter=backoff.full_jitter,
148
- on_backoff=backoff_handler
 
 
149
  )
150
  def hf_hub_download_with_backoff(**kwargs):
151
- """Wrapper for hf_hub_download with exponential backoff on rate limits."""
152
  return hf_hub_download(**kwargs)
153
 
154
 
155
- @backoff.on_exception(
156
- backoff.expo,
157
- HfHubHTTPError,
158
- giveup=lambda e: not is_rate_limit_error(e),
159
- max_tries=8,
160
- base=300, # Start at 5 minutes (300 seconds)
161
- max_value=3600, # Cap at 60 minutes (3600 seconds)
162
- jitter=backoff.full_jitter,
163
- on_backoff=backoff_handler
164
- )
165
- def upload_folder_with_backoff(api, **kwargs):
166
- """Wrapper for HfApi.upload_folder with exponential backoff on rate limits."""
167
- return api.upload_folder(**kwargs)
168
-
169
-
170
- @backoff.on_exception(
171
- backoff.expo,
172
- HfHubHTTPError,
173
- giveup=lambda e: not is_rate_limit_error(e),
174
- max_tries=8,
175
- base=300, # Start at 5 minutes (300 seconds)
176
- max_value=3600, # Cap at 60 minutes (3600 seconds)
177
- jitter=backoff.full_jitter,
178
- on_backoff=backoff_handler
179
- )
180
- def upload_file_with_backoff(api, **kwargs):
181
- """Wrapper for HfApi.upload_file with exponential backoff on rate limits."""
182
- return api.upload_file(**kwargs)
183
-
184
-
185
  # =============================================================================
186
- # BIGQUERY FUNCTIONS
187
  # =============================================================================
188
 
189
- def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=100, upload_immediately=True):
190
  """
191
- Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
192
- Splits agents into smaller batches to avoid performance issues with large numbers of agents.
193
 
194
- Args:
195
- client: BigQuery client instance
196
- identifiers: List of GitHub usernames/bot identifiers
197
- start_date: Start datetime (timezone-aware)
198
- end_date: End datetime (timezone-aware)
199
- batch_size: Number of agents to process per batch (default: 100)
200
- upload_immediately: If True, upload each batch's results to HuggingFace immediately (default: True)
201
-
202
- Returns:
203
- Dictionary mapping agent identifier to list of issue metadata
204
  """
205
- # Split identifiers into batches
206
- batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
207
- total_batches = len(batches)
208
-
209
- print(f"\n🔍 Using BATCHED approach for {len(identifiers)} agents")
210
- print(f" Total batches: {total_batches} (batch size: {batch_size})")
211
- print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
212
- if upload_immediately:
213
- print(f" Upload mode: Immediate (after each batch)")
214
- else:
215
- print(f" Upload mode: Deferred (all at once)")
216
-
217
- # Collect results from all batches
218
- all_metadata = {}
219
-
220
- for batch_num, batch_identifiers in enumerate(batches, 1):
221
- print(f"\n📦 Processing batch {batch_num}/{total_batches} ({len(batch_identifiers)} agents)...")
222
-
223
  try:
224
- # Query each batch
225
- batch_results = fetch_all_pr_metadata_single_query(
226
227
  )
228
 
229
- # Merge results
230
- for identifier, metadata_list in batch_results.items():
231
- if identifier in all_metadata:
232
- all_metadata[identifier].extend(metadata_list)
233
- else:
234
- all_metadata[identifier] = metadata_list
235
-
236
- print(f" ✓ Batch {batch_num}/{total_batches} complete")
237
-
238
- # Upload immediately after this batch if enabled
239
- if upload_immediately and batch_results:
240
- print(f"\n 📤 Uploading batch {batch_num}/{total_batches} results to HuggingFace...")
241
- upload_success = 0
242
- upload_errors = 0
243
-
244
- for identifier, metadata_list in batch_results.items():
245
- if metadata_list:
246
- if save_pr_metadata_to_hf(metadata_list, identifier):
247
- upload_success += 1
248
- else:
249
- upload_errors += 1
250
-
251
- print(f" ✓ Batch {batch_num}/{total_batches} upload complete ({upload_success} agents uploaded, {upload_errors} errors)")
252
-
253
- except Exception as e:
254
- print(f" ✗ Batch {batch_num}/{total_batches} failed: {str(e)}")
255
- print(f" Continuing with remaining batches...")
256
- continue
257
-
258
- total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
259
- print(f"\n✓ All batches complete! Found {total_prs} total PRs across {len(all_metadata)} agents")
260
-
261
- return all_metadata
262
-
263
-
264
- def get_bigquery_client():
265
- """
266
- Initialize BigQuery client using credentials from environment variable.
267
-
268
- Expects GOOGLE_APPLICATION_CREDENTIALS_JSON environment variable containing
269
- the service account JSON credentials as a string.
270
- """
271
- # Get the JSON content from environment variable
272
- creds_json = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS_JSON')
273
-
274
- if creds_json:
275
- # Create a temporary file to store credentials
276
- with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as temp_file:
277
- temp_file.write(creds_json)
278
- temp_path = temp_file.name
279
-
280
- # Set environment variable to point to temp file
281
- os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = temp_path
282
-
283
- # Initialize BigQuery client
284
- client = bigquery.Client()
285
 
286
- # Clean up temp file
287
- os.unlink(temp_path)
 
288
 
289
- return client
290
- else:
291
- raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON not found in environment")
292
-
293
-
294
- def generate_table_union_statements(start_date, end_date):
295
- """
296
- Generate UNION ALL statements for githubarchive.month tables in date range.
297
-
298
- Args:
299
- start_date: Start datetime
300
- end_date: End datetime
301
-
302
- Returns:
303
- String with UNION ALL SELECT statements for all tables in range
304
- """
305
- table_names = []
306
-
307
- # Start from the beginning of start_date's month
308
- current_date = start_date.replace(day=1)
309
- end_month = end_date.replace(day=1)
310
-
311
- while current_date <= end_month:
312
- table_name = f"`githubarchive.month.{current_date.strftime('%Y%m')}`"
313
- table_names.append(table_name)
314
-
315
- # Move to next month
316
- if current_date.month == 12:
317
- current_date = current_date.replace(year=current_date.year + 1, month=1)
318
- else:
319
- current_date = current_date.replace(month=current_date.month + 1)
320
-
321
- # Create UNION ALL chain
322
- union_parts = [f"SELECT * FROM {table}" for table in table_names]
323
- return " UNION ALL ".join(union_parts)
324
-
325
-
326
- def fetch_all_pr_metadata_single_query(client, identifiers, start_date, end_date):
327
- """
328
- Fetch PR metadata for a BATCH of agents using ONE comprehensive BigQuery query.
329
-
330
- NOTE: This function is designed for smaller batches (~100 agents).
331
- For large numbers of agents, use fetch_issue_metadata_batched() instead.
332
-
333
- This query fetches PRs authored by agents (user.login matches identifier).
334
-
335
- Args:
336
- client: BigQuery client instance
337
- identifiers: List of GitHub usernames/bot identifiers
338
- start_date: Start datetime (timezone-aware)
339
- end_date: End datetime (timezone-aware)
340
-
341
- Returns:
342
- Dictionary mapping agent identifier to list of PR metadata
343
- """
344
- print(f" Querying BigQuery for {len(identifiers)} agents in this batch...")
345
- print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
346
-
347
- # Generate table UNION statements for the time range
348
- table_union = generate_table_union_statements(start_date, end_date)
349
-
350
- # Build identifier list for SQL IN clause (author matching only)
351
- author_list = ', '.join([f"'{id}'" for id in identifiers])
352
-
353
- # Build comprehensive query with CTE
354
- query = f"""
355
- WITH pr_events AS (
356
- -- Get all PR events (opened, closed) for all agents
357
- SELECT
358
- JSON_EXTRACT_SCALAR(payload, '$.pull_request.html_url') as html_url,
359
- JSON_EXTRACT_SCALAR(payload, '$.pull_request.user.login') as pr_author,
360
- JSON_EXTRACT_SCALAR(payload, '$.pull_request.created_at') as created_at,
361
- CAST(JSON_EXTRACT_SCALAR(payload, '$.pull_request.merged') AS BOOL) as is_merged,
362
- JSON_EXTRACT_SCALAR(payload, '$.pull_request.merged_at') as merged_at,
363
- JSON_EXTRACT_SCALAR(payload, '$.pull_request.closed_at') as closed_at,
364
- JSON_EXTRACT_SCALAR(payload, '$.action') as action,
365
- created_at as event_time
366
- FROM (
367
- {table_union}
368
- ) t
369
- WHERE
370
- type = 'PullRequestEvent'
371
- AND JSON_EXTRACT_SCALAR(payload, '$.pull_request.html_url') IS NOT NULL
372
- AND JSON_EXTRACT_SCALAR(payload, '$.pull_request.user.login') IN ({author_list})
373
- ),
374
-
375
- pr_latest_state AS (
376
- -- Get the latest state for each PR (most recent event)
377
- SELECT
378
- html_url,
379
- pr_author,
380
- created_at,
381
- merged_at,
382
- closed_at,
383
- ROW_NUMBER() OVER (PARTITION BY html_url ORDER BY event_time DESC) as row_num
384
- FROM pr_events
385
- )
386
 
387
- -- Return deduplicated PR metadata
388
- SELECT DISTINCT
389
- html_url,
390
- pr_author,
391
- created_at,
392
- merged_at,
393
- closed_at
394
- FROM pr_latest_state
395
- WHERE row_num = 1
396
- ORDER BY created_at DESC
397
- """
398
 
399
- print(f" Scanning {(end_date - start_date).days} days of GitHub Archive data...")
400
- print(f" Batch agents: {', '.join(identifiers[:5])}{'...' if len(identifiers) > 5 else ''}")
401
 
402
- try:
403
- query_job = client.query(query)
404
- results = list(query_job.result())
405
-
406
- print(f" ✓ Found {len(results)} PRs in this batch")
407
-
408
- # Group results by agent
409
- metadata_by_agent = defaultdict(list)
410
-
411
- for row in results:
412
- # Convert datetime objects to ISO strings
413
- created_at = row.created_at
414
- if hasattr(created_at, 'isoformat'):
415
- created_at = created_at.isoformat()
416
-
417
- merged_at = row.merged_at
418
- if hasattr(merged_at, 'isoformat'):
419
- merged_at = merged_at.isoformat()
420
-
421
- closed_at = row.closed_at
422
- if hasattr(closed_at, 'isoformat'):
423
- closed_at = closed_at.isoformat()
424
-
425
- pr_data = {
426
- 'html_url': row.html_url,
427
- 'created_at': created_at,
428
- 'merged_at': merged_at,
429
- 'closed_at': closed_at,
430
- }
431
-
432
- # Assign to agent based on author
433
- pr_author = row.pr_author
434
- if pr_author and pr_author in identifiers:
435
- metadata_by_agent[pr_author].append(pr_data)
436
-
437
- # Print breakdown by agent (only show agents with PRs)
438
- print(f" 📊 Batch breakdown:")
439
- for identifier in identifiers:
440
- count = len(metadata_by_agent.get(identifier, []))
441
- if count > 0:
442
- metadata = metadata_by_agent[identifier]
443
- merged_count = sum(1 for m in metadata if m['merged_at'] is not None)
444
- closed_count = sum(1 for m in metadata if m['closed_at'] is not None and m['merged_at'] is None)
445
- open_count = count - merged_count - closed_count
446
- print(f" {identifier}: {count} PRs ({merged_count} merged, {closed_count} closed, {open_count} open)")
447
-
448
- # Convert defaultdict to regular dict
449
- return dict(metadata_by_agent)
450
 
451
- except Exception as e:
452
- print(f" ✗ BigQuery error: {str(e)}")
453
- import traceback
454
- traceback.print_exc()
455
- return {}
 
456
 
 
 
457
 
458
- # =============================================================================
459
- # GITHUB API OPERATIONS (Minimal - Only for Validation)
460
- # =============================================================================
 
 
 
 
461
 
462
- def get_github_token():
463
- """Get first GitHub token from environment variables."""
464
- token = os.getenv('GITHUB_TOKEN')
465
- if not token:
466
- print("Warning: GITHUB_TOKEN not found. Validation will be limited.")
467
- return token
468
 
469
 
470
  def validate_github_username(identifier):
471
- """Verify that a GitHub identifier exists (simple validation)."""
472
  try:
473
- token = get_github_token()
474
- headers = {'Authorization': f'token {token}'} if token else {}
475
  url = f'https://api.github.com/users/{identifier}'
476
-
477
- response = requests.get(url, headers=headers, timeout=10)
478
-
479
  if response.status_code == 200:
480
  return True, "Username is valid"
481
  elif response.status_code == 404:
@@ -486,414 +171,6 @@ def validate_github_username(identifier):
486
  return False, f"Validation error: {str(e)}"
487
 
488
 
489
- # =============================================================================
490
- # PR STATISTICS
491
- # =============================================================================
492
-
493
- def calculate_pr_stats_from_metadata(metadata_list):
494
- """
495
- Calculate statistics from a list of PR metadata (lightweight objects).
496
- Works with minimal metadata: html_url, created_at, merged_at, closed_at.
497
-
498
- Returns a dictionary with comprehensive PR metrics.
499
-
500
- Acceptance rate is calculated as:
501
- merged PRs / (merged PRs + closed but not merged PRs) * 100
502
-
503
- This only counts PRs where a decision has been made (either merged or rejected/closed).
504
- """
505
- total_prs = len(metadata_list)
506
- merged = sum(1 for pr_meta in metadata_list if pr_meta.get('merged_at'))
507
-
508
- # Count closed PRs (rejected) - those with closed_at but no merged_at
509
- closed_not_merged = sum(1 for pr_meta in metadata_list
510
- if pr_meta.get('closed_at') and not pr_meta.get('merged_at'))
511
-
512
- # Total decisions made = merged + closed (rejected)
513
- total_decisions = merged + closed_not_merged
514
-
515
- # Calculate acceptance rate based on decisions made
516
- acceptance_rate = (merged / total_decisions * 100) if total_decisions > 0 else 0
517
-
518
- return {
519
- 'total_prs': total_prs,
520
- 'merged_prs': merged,
521
- 'acceptance_rate': round(acceptance_rate, 2),
522
- }
523
-
524
-
525
- def calculate_monthly_metrics_by_agent(top_n=None):
526
- """
527
- Calculate monthly metrics for all agents (or top N agents) for visualization.
528
- Loads data directly from SWE-Arena/pr_metadata dataset.
529
-
530
- Args:
531
- top_n: If specified, only return metrics for the top N agents by total PRs.
532
- Agents are ranked by their total PR count across all months.
533
-
534
- Returns:
535
- dict: {
536
- 'agents': list of agent names,
537
- 'months': list of month labels (e.g., '2025-01'),
538
- 'data': {
539
- agent_name: {
540
- 'acceptance_rates': list of acceptance rates by month,
541
- 'total_prs': list of PR counts by month,
542
- 'merged_prs': list of merged PR counts by month,
543
- 'closed_not_merged': list of closed but not merged PR counts by month
544
- }
545
- }
546
- }
547
- """
548
- # Load ALL agents from HuggingFace agents repo
549
- agents = load_agents_from_hf()
550
-
551
- # Create mapping from agent_identifier to agent_name
552
- identifier_to_name = {agent.get('github_identifier'): agent.get('name', 'Unknown') for agent in agents if agent.get('github_identifier')}
553
-
554
- # Load all PR metadata from pr_metadata dataset
555
- all_metadata = load_pr_metadata()
556
-
557
- if not all_metadata:
558
- return {'agents': [], 'months': [], 'data': {}}
559
-
560
- # Group by agent and month
561
- agent_month_data = defaultdict(lambda: defaultdict(list))
562
-
563
- for pr_meta in all_metadata:
564
- agent_identifier = pr_meta.get('agent_identifier')
565
- created_at = pr_meta.get('created_at')
566
-
567
- if not agent_identifier or not created_at:
568
- continue
569
-
570
- # Get agent_name from identifier
571
- agent_name = identifier_to_name.get(agent_identifier, agent_identifier)
572
-
573
- try:
574
- dt = parse_date_string(created_at)
575
- month_key = f"{dt.year}-{dt.month:02d}"
576
- agent_month_data[agent_name][month_key].append(pr_meta)
577
- except Exception as e:
578
- print(f"Warning: Could not parse date '{created_at}': {e}")
579
- continue
580
-
581
- # Get all unique months and sort them
582
- all_months = set()
583
- for agent_data in agent_month_data.values():
584
- all_months.update(agent_data.keys())
585
- months = sorted(list(all_months))
586
-
587
- # Calculate metrics for each agent and month
588
- result_data = {}
589
- for agent_name, month_dict in agent_month_data.items():
590
- acceptance_rates = []
591
- total_prs = []
592
- merged_prs = []
593
- closed_not_merged_list = []
594
-
595
- for month in months:
596
- prs_in_month = month_dict.get(month, [])
597
-
598
- # Count merged PRs
599
- merged_count = sum(1 for pr in prs_in_month if pr.get('merged_at'))
600
-
601
- # Count closed but not merged
602
- closed_not_merged_count = sum(1 for pr in prs_in_month
603
- if pr.get('closed_at') and not pr.get('merged_at'))
604
-
605
- # Total PRs created in this month
606
- total_count = len(prs_in_month)
607
-
608
- # Calculate acceptance rate
609
- total_decisions = merged_count + closed_not_merged_count
610
- acceptance_rate = (merged_count / total_decisions * 100) if total_decisions > 0 else None
611
-
612
- acceptance_rates.append(acceptance_rate)
613
- total_prs.append(total_count)
614
- merged_prs.append(merged_count)
615
- closed_not_merged_list.append(closed_not_merged_count)
616
-
617
- result_data[agent_name] = {
618
- 'acceptance_rates': acceptance_rates,
619
- 'total_prs': total_prs,
620
- 'merged_prs': merged_prs,
621
- 'closed_not_merged': closed_not_merged_list
622
- }
623
-
624
- # Filter to top N agents if specified
625
- agents_list = sorted(list(agent_month_data.keys()))
626
- if top_n is not None and top_n > 0:
627
- # Calculate total PRs for each agent across all months
628
- agent_totals = []
629
- for agent_name in agents_list:
630
- total_pr_count = sum(result_data[agent_name]['total_prs'])
631
- agent_totals.append((agent_name, total_pr_count))
632
-
633
- # Sort by total PRs (descending) and take top N
634
- agent_totals.sort(key=lambda x: x[1], reverse=True)
635
- top_agents = [agent_name for agent_name, _ in agent_totals[:top_n]]
636
-
637
- # Filter result_data to only include top agents
638
- result_data = {agent: result_data[agent] for agent in top_agents if agent in result_data}
639
- agents_list = top_agents
640
-
641
- return {
642
- 'agents': agents_list,
643
- 'months': months,
644
- 'data': result_data
645
- }
646
-
647
-
648
- # =============================================================================
649
- # PR METADATA STORAGE & RETRIEVAL
650
- # =============================================================================
651
-
652
- def group_metadata_by_date(metadata_list):
653
- """
654
- Group PR metadata by exact date (year.month.day) for efficient daily storage.
655
- Returns dict: {(year, month, day): [metadata_list]}
656
- """
657
- grouped = defaultdict(list)
658
-
659
- for pr_meta in metadata_list:
660
- created_at = pr_meta.get('created_at')
661
- if not created_at:
662
- continue
663
-
664
- try:
665
- dt = parse_date_string(created_at)
666
- key = (dt.year, dt.month, dt.day)
667
- grouped[key].append(pr_meta)
668
- except Exception as e:
669
- print(f"Warning: Could not parse date '{created_at}': {e}")
670
-
671
- return dict(grouped)
672
-
673
-
674
- def save_pr_metadata_to_hf(metadata_list, agent_identifier):
675
- """
676
- Save PR metadata to HuggingFace dataset, organized by [agent_identifier]/YYYY.MM.DD.jsonl.
677
- Each file is stored in the agent's folder and named YYYY.MM.DD.jsonl for that day's PRs.
678
-
679
- This function OVERWRITES existing files completely with fresh data from BigQuery.
680
- Uses batch upload to avoid rate limit (uploads entire folder in single operation).
681
-
682
- Args:
683
- metadata_list: List of PR metadata dictionaries
684
- agent_identifier: GitHub identifier of the agent (used as folder name)
685
- """
686
- import shutil
687
-
688
- try:
689
- token = get_hf_token()
690
- if not token:
691
- raise Exception("No HuggingFace token found")
692
-
693
- api = HfApi(token=token)
694
-
695
- # Group by date (year, month, day)
696
- grouped = group_metadata_by_date(metadata_list)
697
-
698
- if not grouped:
699
- print(f" No valid metadata to save for {agent_identifier}")
700
- return False
701
-
702
- # Create a temporary directory for batch upload
703
- temp_dir = tempfile.mkdtemp()
704
- agent_folder = os.path.join(temp_dir, agent_identifier)
705
- os.makedirs(agent_folder, exist_ok=True)
706
-
707
- try:
708
- print(f" 📦 Preparing batch upload for {len(grouped)} daily files...")
709
-
710
- # Process each daily file
711
- for (pr_year, month, day), day_metadata in grouped.items():
712
- filename = f"{agent_identifier}/{pr_year}.{month:02d}.{day:02d}.jsonl"
713
- local_filename = os.path.join(agent_folder, f"{pr_year}.{month:02d}.{day:02d}.jsonl")
714
-
715
- # Sort by created_at for better organization
716
- day_metadata.sort(key=lambda x: x.get('created_at', ''), reverse=True)
717
-
718
- # Save to temp directory (complete overwrite, no merging)
719
- save_jsonl(local_filename, day_metadata)
720
- print(f" Prepared {len(day_metadata)} PRs for {filename}")
721
-
722
- # Upload entire folder using upload_folder (single commit per agent)
723
- print(f" 📤 Uploading {len(grouped)} files ({len(metadata_list)} total PRs)...")
724
- upload_folder_with_backoff(
725
- api,
726
- folder_path=temp_dir,
727
- repo_id=PR_METADATA_REPO,
728
- repo_type="dataset",
729
- commit_message=f"Update PR metadata for {agent_identifier}"
730
- )
731
- print(f" ✓ Batch upload complete for {agent_identifier}")
732
-
733
- return True
734
-
735
- finally:
736
- # Always clean up temp directory
737
- if os.path.exists(temp_dir):
738
- shutil.rmtree(temp_dir)
739
-
740
- except Exception as e:
741
- print(f" ✗ Error saving PR metadata: {str(e)}")
742
- import traceback
743
- traceback.print_exc()
744
- return False
745
-
746
-
747
- def load_pr_metadata():
748
- """
749
- Loads PR metadata from the last LEADERBOARD_TIME_FRAME_DAYS only.
750
-
751
- Structure: [agent_identifier]/YYYY.MM.DD.jsonl
752
-
753
- Returns:
754
- List of dictionaries with 'agent_identifier' added to each PR metadata.
755
- Only includes PRs within the last LEADERBOARD_TIME_FRAME_DAYS.
756
- """
757
- try:
758
- api = HfApi()
759
- token = get_hf_token()
760
-
761
- # Calculate cutoff date for filtering
762
- cutoff_date = datetime.now(timezone.utc) - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
763
-
764
- # List all files in the repository
765
- files = list_repo_files_with_backoff(api, repo_id=PR_METADATA_REPO, repo_type="dataset")
766
-
767
- # Filter for files within the time frame: [agent_identifier]/YYYY.MM.DD.jsonl
768
- # Parse date from filename and only include files within LEADERBOARD_TIME_FRAME_DAYS
769
- relevant_files = []
770
- for f in files:
771
- if f.endswith('.jsonl'):
772
- parts = f.split('/')
773
- if len(parts) == 2: # [agent_identifier]/YYYY.MM.DD.jsonl
774
- filename = parts[1]
775
- try:
776
- # Parse date from filename: YYYY.MM.DD.jsonl
777
- date_part = filename.replace('.jsonl', '') # Get YYYY.MM.DD
778
- date_components = date_part.split('.')
779
- if len(date_components) == 3:
780
- file_year, file_month, file_day = map(int, date_components)
781
- file_date = datetime(file_year, file_month, file_day, tzinfo=timezone.utc)
782
-
783
- # Only include files within the time frame
784
- if file_date >= cutoff_date:
785
- relevant_files.append(f)
786
- except Exception:
787
- # If date parsing fails, skip this file
788
- continue
789
-
790
- total_months = LEADERBOARD_TIME_FRAME_DAYS // 30
791
- print(f"📥 Loading PR metadata from last {total_months} months ({len(relevant_files)} daily files across all agents)...")
792
-
793
- all_metadata = []
794
- for filename in relevant_files:
795
- try:
796
- # Extract agent_identifier from path (first part)
797
- # Format: agent_identifier/YYYY.MM.DD.jsonl
798
- parts = filename.split('/')
799
- if len(parts) != 2:
800
- print(f" Warning: Unexpected filename format: {filename}")
801
- continue
802
-
803
- agent_identifier = parts[0]
804
-
805
- file_path = hf_hub_download_with_backoff(
806
- repo_id=PR_METADATA_REPO,
807
- filename=filename,
808
- repo_type="dataset",
809
- token=token
810
- )
811
- day_metadata = load_jsonl(file_path)
812
-
813
- # Filter individual PRs by created_at date as a double-check
814
- for pr_meta in day_metadata:
815
- created_at = pr_meta.get('created_at')
816
- if created_at:
817
- try:
818
- dt = parse_date_string(created_at)
819
- if dt >= cutoff_date:
820
- pr_meta['agent_identifier'] = agent_identifier
821
- all_metadata.append(pr_meta)
822
- except Exception:
823
- # If date parsing fails, skip this PR
824
- continue
825
- else:
826
- # If no created_at, skip this PR
827
- continue
828
-
829
- print(f" ✓ Loaded PRs from {filename}")
830
- except Exception as e:
831
- print(f" Warning: Could not load {filename}: {str(e)}")
832
-
833
- print(f"✓ Loaded {len(all_metadata)} total PRs from last {total_months} months")
834
- return all_metadata
835
-
836
- except Exception as e:
837
- total_months = LEADERBOARD_TIME_FRAME_DAYS // 30
838
- print(f"✗ Error loading PR metadata from last {total_months} months: {str(e)}")
839
- return []
840
-
841
-
842
- def get_daily_files_last_time_frame(agent_identifier):
843
- """
844
- Get list of daily file paths for an agent from the configured time frame.
845
-
846
- Args:
847
- agent_identifier: GitHub identifier of the agent
848
-
849
- Returns:
850
- List of file paths in format: [agent_identifier]/YYYY.MM.DD.jsonl
851
- """
852
- try:
853
- api = HfApi()
854
- token = get_hf_token()
855
-
856
- # Calculate date range using configured time frame
857
- today = datetime.now(timezone.utc)
858
- cutoff_date = today - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
859
-
860
- # List all files in the repository
861
- files = list_repo_files_with_backoff(api, repo_id=PR_METADATA_REPO, repo_type="dataset")
862
-
863
- # Filter for files in this agent's folder
864
- agent_pattern = f"{agent_identifier}/"
865
- agent_files = [f for f in files if f.startswith(agent_pattern) and f.endswith('.jsonl')]
866
-
867
- # Filter by date range (extract date from filename)
868
- recent_files = []
869
- for filename in agent_files:
870
- try:
871
- # Extract date from filename: YYYY.MM.DD.jsonl
872
- parts = filename.split('/')
873
- if len(parts) != 2:
874
- continue
875
-
876
- date_part = parts[1].replace('.jsonl', '') # Get YYYY.MM.DD
877
- date_components = date_part.split('.')
878
- if len(date_components) != 3:
879
- continue
880
-
881
- file_year, file_month, file_day = map(int, date_components)
882
- file_date = datetime(file_year, file_month, file_day, tzinfo=timezone.utc)
883
-
884
- # Include if within configured time frame
885
- if cutoff_date <= file_date <= today:
886
- recent_files.append(filename)
887
- except Exception:
888
- continue
889
-
890
- return recent_files
891
-
892
- except Exception as e:
893
- print(f"Error getting daily files: {str(e)}")
894
- return []
895
-
896
-
897
  # =============================================================================
898
  # HUGGINGFACE DATASET OPERATIONS
899
  # =============================================================================
@@ -905,13 +182,11 @@ def load_agents_from_hf():
905
  agents = []
906
 
907
  # List all files in the repository
908
- files = list_repo_files_with_backoff(api, repo_id=AGENTS_REPO, repo_type="dataset")
909
 
910
  # Filter for JSON files only
911
  json_files = [f for f in files if f.endswith('.json')]
912
 
913
- print(f"Found {len(json_files)} agent files in {AGENTS_REPO}")
914
-
915
  # Download and parse each JSON file
916
  for json_file in json_files:
917
  try:
@@ -928,9 +203,11 @@ def load_agents_from_hf():
928
  if agent_data.get('status') != 'public':
929
  continue
930
 
931
- # Extract github_identifier from filename (remove .json extension)
932
- github_identifier = json_file.replace('.json', '')
933
- agent_data['github_identifier'] = github_identifier
 
 
934
 
935
  agents.append(agent_data)
936
 
@@ -938,7 +215,7 @@ def load_agents_from_hf():
938
  print(f"Warning: Could not load {json_file}: {str(e)}")
939
  continue
940
 
941
- print(f"Loaded {len(agents)} agents from HuggingFace")
942
  return agents
943
 
944
  except Exception as e:
@@ -954,37 +231,6 @@ def get_hf_token():
954
  return token
955
 
956
 
957
- def load_leaderboard_data_from_hf():
958
- """
959
- Load pre-computed leaderboard and monthly metrics data from HuggingFace.
960
-
961
- Returns:
962
- Dictionary with 'leaderboard', 'monthly_metrics', and 'last_updated' keys.
963
- Returns None if file doesn't exist or error occurs.
964
- """
965
- try:
966
- token = get_hf_token()
967
-
968
- # Download the swe-pr.json file
969
- file_path = hf_hub_download_with_backoff(
970
- repo_id=LEADERBOARD_REPO,
971
- filename="swe-pr.json",
972
- repo_type="dataset",
973
- token=token
974
- )
975
-
976
- with open(file_path, 'r') as f:
977
- data = json.load(f)
978
-
979
- print(f"✓ Loaded leaderboard data (last updated: {data.get('last_updated', 'Unknown')})")
980
- return data
981
-
982
- except Exception as e:
983
- print(f"⚠️ Could not load leaderboard data from HuggingFace: {str(e)}")
984
- print(f" Falling back to computing from raw PR metadata...")
985
- return None
986
-
987
-
988
  def upload_with_retry(api, path_or_fileobj, path_in_repo, repo_id, repo_type, token, max_retries=5):
989
  """
990
  Upload file to HuggingFace with exponential backoff retry logic.
@@ -1013,18 +259,18 @@ def upload_with_retry(api, path_or_fileobj, path_in_repo, repo_id, repo_type, to
1013
  token=token
1014
  )
1015
  if attempt > 0:
1016
- print(f" Upload succeeded on attempt {attempt + 1}/{max_retries}")
1017
  return True
1018
 
1019
  except Exception as e:
1020
  if attempt < max_retries - 1:
1021
  wait_time = delay + random.uniform(0, 1.0)
1022
- print(f" ⚠️ Upload failed (attempt {attempt + 1}/{max_retries}): {str(e)}")
1023
- print(f" Retrying in {wait_time:.1f} seconds...")
1024
  time.sleep(wait_time)
1025
  delay = min(delay * 2, 60.0) # Exponential backoff, max 60s
1026
  else:
1027
- print(f" Upload failed after {max_retries} attempts: {str(e)}")
1028
  raise
1029
 
1030
 
@@ -1054,7 +300,7 @@ def save_agent_to_hf(data):
1054
  repo_type="dataset",
1055
  token=token
1056
  )
1057
- print(f"Saved agent to HuggingFace: {filename}")
1058
  return True
1059
  finally:
1060
  # Always clean up local file, even if upload fails
@@ -1062,208 +308,52 @@ def save_agent_to_hf(data):
1062
  os.remove(filename)
1063
 
1064
  except Exception as e:
1065
- print(f"Error saving agent: {str(e)}")
1066
  return False
1067
 
1068
 
1069
- def save_leaderboard_and_metrics_to_hf():
1070
  """
1071
- Creates a comprehensive JSON file with both leaderboard stats and monthly metrics.
1072
- If the file exists, it will be overwritten.
1073
 
1074
  Returns:
1075
- bool: True if successful, False otherwise
 
1076
  """
1077
- import io
1078
-
1079
  try:
1080
  token = get_hf_token()
1081
- if not token:
1082
- raise Exception("No HuggingFace token found")
1083
-
1084
- api = HfApi(token=token)
1085
-
1086
- print(f"\n{'='*80}")
1087
- print(f"📊 Preparing leaderboard and metrics data for upload...")
1088
- print(f"{'='*80}\n")
1089
-
1090
- # Get leaderboard data
1091
- print(" Constructing leaderboard data...")
1092
- leaderboard_data = construct_leaderboard_from_metadata()
1093
-
1094
- # Get monthly metrics data (all agents, not just top N)
1095
- print(" Calculating monthly metrics...")
1096
- monthly_metrics = calculate_monthly_metrics_by_agent(top_n=None)
1097
-
1098
- # Combine into a single structure
1099
- combined_data = {
1100
- "leaderboard": leaderboard_data,
1101
- "monthly_metrics": monthly_metrics,
1102
- "metadata": {
1103
- "last_updated": datetime.now(timezone.utc).isoformat(),
1104
- "time_frame_days": LEADERBOARD_TIME_FRAME_DAYS,
1105
- "total_agents": len(leaderboard_data)
1106
- }
1107
- }
1108
-
1109
- print(f" Leaderboard entries: {len(leaderboard_data)}")
1110
- print(f" Monthly metrics for: {len(monthly_metrics['agents'])} agents")
1111
- print(f" Time frame: {LEADERBOARD_TIME_FRAME_DAYS} days")
1112
-
1113
- # Convert to JSON and create file-like object
1114
- json_content = json.dumps(combined_data, indent=2)
1115
- file_like_object = io.BytesIO(json_content.encode('utf-8'))
1116
 
1117
- # Upload to HuggingFace (will overwrite if exists)
1118
- print(f"\n🤗 Uploading to {LEADERBOARD_REPO}...")
1119
- upload_file_with_backoff(
1120
- api,
1121
- path_or_fileobj=file_like_object,
1122
- path_in_repo="swe-pr.json",
1123
  repo_id=LEADERBOARD_REPO,
 
1124
  repo_type="dataset",
1125
- token=token,
1126
- commit_message=f"Update leaderboard data - {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC"
1127
  )
1128
 
1129
- print(f" ✓ Successfully uploaded swe-pr.json")
1130
- print(f"{'='*80}\n")
1131
-
1132
- return True
1133
-
1134
- except Exception as e:
1135
- print(f" ✗ Error saving leaderboard data: {str(e)}")
1136
- import traceback
1137
- traceback.print_exc()
1138
- return False
1139
-
1140
-
1141
- # =============================================================================
1142
- # DATA MANAGEMENT
1143
- # =============================================================================
1144
-
1145
- def mine_all_agents():
1146
- """
1147
- Mine PR metadata for all agents within UPDATE_TIME_FRAME_DAYS and save to HuggingFace.
1148
- Uses BATCHED BigQuery queries for all agents (efficient approach).
1149
- """
1150
- # Load agent metadata from HuggingFace
1151
- agents = load_agents_from_hf()
1152
- if not agents:
1153
- print("No agents found in HuggingFace dataset")
1154
- return
1155
-
1156
- # Extract all identifiers
1157
- identifiers = [agent['github_identifier'] for agent in agents if agent.get('github_identifier')]
1158
- if not identifiers:
1159
- print("No valid agent identifiers found")
1160
- return
1161
-
1162
- print(f"\n{'='*80}")
1163
- print(f"Starting PR metadata mining for {len(identifiers)} agents")
1164
- print(f"Time frame: Last {UPDATE_TIME_FRAME_DAYS} days")
1165
- print(f"Data source: BigQuery + GitHub Archive (BATCHED QUERIES)")
1166
- print(f"{'='*80}\n")
1167
-
1168
- # Initialize BigQuery client
1169
- try:
1170
- client = get_bigquery_client()
1171
- except Exception as e:
1172
- print(f"✗ Failed to initialize BigQuery client: {str(e)}")
1173
- return
1174
-
1175
- # Define time range: past UPDATE_TIME_FRAME_DAYS (excluding today)
1176
- current_time = datetime.now(timezone.utc)
1177
- end_date = current_time.replace(hour=0, minute=0, second=0, microsecond=0)
1178
- start_date = end_date - timedelta(days=UPDATE_TIME_FRAME_DAYS)
1179
-
1180
- try:
1181
- # Use batched approach for better performance
1182
- # upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
1183
- all_metadata = fetch_issue_metadata_batched(
1184
- client, identifiers, start_date, end_date, batch_size=100, upload_immediately=True
1185
- )
1186
 
1187
- # Calculate summary statistics
1188
- total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
1189
- agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
1190
 
1191
- print(f"\n{'='*80}")
1192
- print(f"✅ BigQuery mining and upload complete!")
1193
- print(f" Total agents: {len(agents)}")
1194
- print(f" Agents with data: {agents_with_data}")
1195
- print(f" Total PRs found: {total_prs}")
1196
- print(f"{'='*80}\n")
1197
 
1198
  except Exception as e:
1199
- print(f" Error during BigQuery fetch: {str(e)}")
1200
- import traceback
1201
- traceback.print_exc()
1202
- return
1203
-
1204
- # After mining is complete, save leaderboard and metrics to HuggingFace
1205
- print(f"📤 Uploading leaderboard and metrics data...")
1206
- if save_leaderboard_and_metrics_to_hf():
1207
- print(f"✓ Leaderboard and metrics successfully uploaded to {LEADERBOARD_REPO}")
1208
- else:
1209
- print(f"⚠️ Failed to upload leaderboard and metrics data")
1210
-
1211
-
1212
- def construct_leaderboard_from_metadata():
1213
- """
1214
- Construct leaderboard from stored PR metadata instead of fetching all PRs.
1215
- Much more memory-efficient and faster.
1216
-
1217
- Returns dictionary of agent stats.
1218
- """
1219
- print("📊 Constructing leaderboard from PR metadata...")
1220
- # Load agents
1221
- agents = load_agents_from_hf()
1222
- if not agents:
1223
- print("No agents found")
1224
- return {}
1225
-
1226
- # Load all PR metadata
1227
- all_metadata = load_pr_metadata()
1228
-
1229
- cache_dict = {}
1230
-
1231
- for agent in agents:
1232
- identifier = agent.get('github_identifier')
1233
- agent_name = agent.get('name', 'Unknown')
1234
-
1235
- # Filter metadata for this agent
1236
- bot_metadata = [pr for pr in all_metadata if pr.get('agent_identifier') == identifier]
1237
-
1238
- # Calculate stats
1239
- stats = calculate_pr_stats_from_metadata(bot_metadata)
1240
-
1241
- cache_dict[identifier] = {
1242
- 'name': agent_name,
1243
- 'website': agent.get('website', 'Unknown'),
1244
- 'github_identifier': identifier,
1245
- **stats
1246
- }
1247
-
1248
- return cache_dict
1249
 
1250
 
1251
  # =============================================================================
1252
  # UI FUNCTIONS
1253
  # =============================================================================
1254
 
1255
- def generate_color(index, total):
1256
- """Generate distinct colors using HSL color space for better distribution"""
1257
- hue = (index * 360 / total) % 360
1258
- saturation = 70 + (index % 3) * 10 # Vary saturation slightly
1259
- lightness = 45 + (index % 2) * 10 # Vary lightness slightly
1260
- return f'hsl({hue}, {saturation}%, {lightness}%)'
1261
-
1262
-
1263
  def create_monthly_metrics_plot(top_n=5):
1264
  """
1265
  Create a Plotly figure with dual y-axes showing:
1266
- - Left y-axis: Acceptance rate (%) as line curves
1267
  - Right y-axis: Total PRs created as bar charts
1268
 
1269
  Each agent gets a unique color for both their line and bars.
@@ -1271,37 +361,47 @@ def create_monthly_metrics_plot(top_n=5):
1271
  Args:
1272
  top_n: Number of top agents to show (default: 5)
1273
  """
1274
- global _LEADERBOARD_CACHE
1275
-
1276
- # Load from cache if available
1277
- if _LEADERBOARD_CACHE is not None:
1278
- metrics = _LEADERBOARD_CACHE.get('monthly_metrics', {})
1279
-
1280
- # Apply top_n filter if specified
1281
- if top_n is not None and top_n > 0 and metrics.get('agents'):
1282
- agents_list = metrics['agents']
1283
- data = metrics['data']
1284
-
1285
- # Calculate total PRs for each agent across all months
1286
- agent_totals = []
1287
- for agent_name in agents_list:
1288
- total_pr_count = sum(data[agent_name]['total_prs'])
1289
- agent_totals.append((agent_name, total_pr_count))
1290
-
1291
- # Sort by total PRs (descending) and take top N
1292
- agent_totals.sort(key=lambda x: x[1], reverse=True)
1293
- top_agents = [agent_name for agent_name, _ in agent_totals[:top_n]]
1294
-
1295
- # Filter result_data to only include top agents
1296
- filtered_data = {agent: data[agent] for agent in top_agents if agent in data}
1297
- metrics = {
1298
- 'agents': top_agents,
1299
- 'months': metrics['months'],
1300
- 'data': filtered_data
1301
- }
1302
- else:
1303
- # Fallback: compute from PR metadata
1304
- metrics = calculate_monthly_metrics_by_agent(top_n=top_n)
1305
 
1306
  if not metrics['agents'] or not metrics['months']:
1307
  # Return an empty figure with a message
@@ -1322,11 +422,19 @@ def create_monthly_metrics_plot(top_n=5):
1322
  # Create figure with secondary y-axis
1323
  fig = make_subplots(specs=[[{"secondary_y": True}]])
1324
1325
  agents = metrics['agents']
1326
  months = metrics['months']
1327
  data = metrics['data']
1328
 
1329
- # Generate colors for all agents using HSL
1330
  agent_colors = {agent: generate_color(idx, len(agents)) for idx, agent in enumerate(agents)}
1331
 
1332
  # Add traces for each agent
@@ -1348,10 +456,11 @@ def create_monthly_metrics_plot(top_n=5):
1348
  name=agent_name,
1349
  mode='lines+markers',
1350
  line=dict(color=color, width=2),
1351
- marker=dict(size=6),
1352
  legendgroup=agent_name,
1353
- showlegend=True,
1354
- hovertemplate='<b>%{fullData.name}</b><br>' +
 
1355
  'Acceptance Rate: %{y:.2f}%<br>' +
1356
  '<extra></extra>'
1357
  ),
@@ -1375,8 +484,9 @@ def create_monthly_metrics_plot(top_n=5):
1375
  name=agent_name,
1376
  marker=dict(color=color, opacity=0.6),
1377
  legendgroup=agent_name,
1378
- showlegend=False, # Don't show in legend (already shown for line)
1379
- hovertemplate='<b>%{fullData.name}</b><br>' +
 
1380
  'Total PRs: %{y}<br>' +
1381
  '<extra></extra>',
1382
  offsetgroup=agent_name # Group bars by agent for proper spacing
@@ -1386,23 +496,26 @@ def create_monthly_metrics_plot(top_n=5):
1386
 
1387
  # Update axes labels
1388
  fig.update_xaxes(title_text=None)
1389
- fig.update_yaxes(title_text="<b>Acceptance Rate (%)</b>", secondary_y=False)
1390
  fig.update_yaxes(title_text="<b>Total PRs</b>", secondary_y=True)
1391
 
1392
  # Update layout
 
1393
  fig.update_layout(
1394
  title=None,
1395
- hovermode='closest',
1396
  barmode='group',
1397
  height=600,
1398
- legend=dict(
1399
- orientation="h",
1400
- yanchor="bottom",
1401
- y=1.02,
1402
- xanchor="right",
1403
- x=1
1404
- ),
1405
- margin=dict(l=50, r=50, t=100, b=50)
1406
  )
1407
 
1408
  return fig
@@ -1410,36 +523,51 @@ def create_monthly_metrics_plot(top_n=5):
1410
 
1411
  def get_leaderboard_dataframe():
1412
  """
1413
- Load leaderboard data from cached JSON and convert to pandas DataFrame for display.
1414
- Falls back to computing from PR metadata if cache is not available.
1415
  Returns formatted DataFrame sorted by total PRs.
1416
  """
1417
- global _LEADERBOARD_CACHE
1418
 
1419
- # Load from cache if available
1420
- if _LEADERBOARD_CACHE is not None:
1421
- cache_dict = _LEADERBOARD_CACHE.get('leaderboard', {})
1422
- else:
1423
- # Fallback: compute from PR metadata
1424
- cache_dict = construct_leaderboard_from_metadata()
1425
 
1426
  if not cache_dict:
 
1427
  # Return empty DataFrame with correct columns if no data
1428
  column_names = [col[0] for col in LEADERBOARD_COLUMNS]
1429
  return pd.DataFrame(columns=column_names)
1430
 
1431
  rows = []
 
1432
  for identifier, data in cache_dict.items():
 
 
 
1433
  # Filter out agents with zero total PRs
1434
- if data.get('total_prs', 0) > 0:
1435
- # Only include display-relevant fields
1436
- rows.append([
1437
- data.get('name', 'Unknown'),
1438
- data.get('website', 'Unknown'),
1439
- data.get('total_prs', 0),
1440
- data.get('merged_prs', 0),
1441
- data.get('acceptance_rate', 0.0),
1442
- ])
1443
 
1444
  # Create DataFrame
1445
  column_names = [col[0] for col in LEADERBOARD_COLUMNS]
@@ -1455,111 +583,125 @@ def get_leaderboard_dataframe():
1455
  if "Total PRs" in df.columns and not df.empty:
1456
  df = df.sort_values(by="Total PRs", ascending=False).reset_index(drop=True)
1457
 
 
 
 
1458
  return df
1459
 
1460
 
1461
- def submit_agent(identifier, agent_name, organization, description, website):
1462
  """
1463
  Submit a new agent to the leaderboard.
1464
  Validates input and saves submission.
1465
- PR data will be populated by the monthly mining task.
1466
  """
1467
  # Validate required fields
1468
  if not identifier or not identifier.strip():
1469
- return " GitHub identifier is required", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1470
  if not agent_name or not agent_name.strip():
1471
- return " Agent name is required", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1472
  if not organization or not organization.strip():
1473
- return " Organization name is required", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1474
  if not website or not website.strip():
1475
- return " Website URL is required", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1476
 
1477
  # Clean inputs
1478
  identifier = identifier.strip()
1479
  agent_name = agent_name.strip()
1480
  organization = organization.strip()
1481
- description = description.strip()
1482
  website = website.strip()
1483
 
1484
  # Validate GitHub identifier
1485
  is_valid, message = validate_github_username(identifier)
1486
  if not is_valid:
1487
- return f" {message}", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1488
 
1489
  # Check for duplicates by loading agents from HuggingFace
1490
  agents = load_agents_from_hf()
1491
  if agents:
1492
  existing_names = {agent['github_identifier'] for agent in agents}
1493
  if identifier in existing_names:
1494
- return f"⚠️ Agent with identifier '{identifier}' already exists", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1495
 
1496
  # Create submission
1497
  submission = {
1498
  'name': agent_name,
1499
  'organization': organization,
1500
  'github_identifier': identifier,
1501
- 'description': description,
1502
  'website': website,
 
1503
  }
1504
 
1505
  # Save to HuggingFace
1506
  if not save_agent_to_hf(submission):
1507
- return " Failed to save submission", get_leaderboard_dataframe(), create_monthly_metrics_plot()
 
 
 
1508
 
1509
- success_msg = f"✅ Successfully submitted {agent_name}!\n\nPR data will be populated by the monthly mining task (runs every 1st of the month at 12:00 AM UTC)."
1510
- return success_msg, get_leaderboard_dataframe(), create_monthly_metrics_plot()
1511
 
1512
 
1513
  # =============================================================================
1514
  # GRADIO APPLICATION
1515
  # =============================================================================
1516
 
1517
- print(f"\n🚀 Starting SWE Agent PR Leaderboard")
1518
- print(f" Leaderboard time frame: {LEADERBOARD_TIME_FRAME_DAYS} days ({LEADERBOARD_TIME_FRAME_DAYS // 30} months)")
1519
- print(f" Mining update frequency: Every {UPDATE_TIME_FRAME_DAYS} days\n")
1520
 
1521
- # Start APScheduler for monthly PR mining at 12:00 AM UTC every 1st of the month
1522
  scheduler = BackgroundScheduler(timezone="UTC")
1523
  scheduler.add_job(
1524
- mine_all_agents,
1525
- trigger=CronTrigger(day=1, hour=0, minute=0), # 12:00 AM UTC every 1st of the month
1526
- id='monthly_pr_mining',
1527
- name='Monthly PR Mining',
1528
  replace_existing=True
1529
  )
1530
  scheduler.start()
1531
  print(f"\n{'='*80}")
1532
- print(f"Scheduler initialized successfully")
1533
- print(f"⛏️ Mining schedule: Every 1st of the month at 12:00 AM UTC")
1534
- print(f"📥 On startup: Only loads cached data from HuggingFace (no mining)")
1535
  print(f"{'='*80}\n")
1536
 
1537
- # Load leaderboard data from HuggingFace at startup
1538
- print(f"📥 Loading leaderboard data from HuggingFace...")
1539
- _LEADERBOARD_CACHE = load_leaderboard_data_from_hf()
1540
-
1541
- if _LEADERBOARD_CACHE is None:
1542
- print(f"⚠️ No cached leaderboard data found - will compute from raw PR metadata")
1543
- else:
1544
- print(f"✓ Leaderboard cache loaded successfully")
1545
-
1546
- print()
1547
-
1548
  # Create Gradio interface
1549
  with gr.Blocks(title="SWE Agent PR Leaderboard", theme=gr.themes.Soft()) as app:
1550
- total_months = LEADERBOARD_TIME_FRAME_DAYS // 30
1551
-
1552
- gr.Markdown("# 🏆 SWE Agent PR Leaderboard")
1553
  gr.Markdown(f"Track and compare GitHub pull request statistics for SWE agents")
1554
 
1555
  with gr.Tabs():
1556
 
1557
  # Leaderboard Tab
1558
- with gr.Tab("📊 Leaderboard"):
1559
- gr.Markdown(f"*All statistics are based on PRs from the last {total_months} months*")
1560
-
1561
  leaderboard_table = Leaderboard(
1562
- value=get_leaderboard_dataframe(),
1563
  datatype=LEADERBOARD_COLUMNS,
1564
  search_columns=["Agent Name", "Website"],
1565
  filter_columns=[
@@ -1574,16 +716,30 @@ with gr.Blocks(title="SWE Agent PR Leaderboard", theme=gr.themes.Soft()) as app:
1574
  ]
1575
  )
1576
 
1577
- gr.Markdown("### Monthly Metrics - Top 5 Agents")
1578
- gr.Markdown("Track acceptance rates and PR activity over time for the most active agents")
1579
 
1580
- monthly_plot = gr.Plot(
1581
- value=create_monthly_metrics_plot(),
1582
- label="Monthly PR Metrics"
 
 
 
 
1583
  )
1584
 
 
1585
  # Submit Agent Tab
1586
- with gr.Tab("Submit Agent"):
1587
 
1588
  gr.Markdown("### Submit Your Agent")
1589
  gr.Markdown("Fill in the details below to add your agent to the leaderboard.")
@@ -1592,7 +748,7 @@ with gr.Blocks(title="SWE Agent PR Leaderboard", theme=gr.themes.Soft()) as app:
1592
  with gr.Column():
1593
  github_input = gr.Textbox(
1594
  label="GitHub Identifier*",
1595
- placeholder="Your agent username (e.g., my-agent-bot)"
1596
  )
1597
  name_input = gr.Textbox(
1598
  label="Agent Name*",
@@ -1604,11 +760,6 @@ with gr.Blocks(title="SWE Agent PR Leaderboard", theme=gr.themes.Soft()) as app:
1604
  label="Organization*",
1605
  placeholder="Your organization or team name"
1606
  )
1607
- description_input = gr.Textbox(
1608
- label="Description",
1609
- placeholder="Brief description of your agent",
1610
- lines=3
1611
- )
1612
  website_input = gr.Textbox(
1613
  label="Website*",
1614
  placeholder="https://your-agent-website.com"
@@ -1626,8 +777,8 @@ with gr.Blocks(title="SWE Agent PR Leaderboard", theme=gr.themes.Soft()) as app:
1626
  # Event handler
1627
  submit_button.click(
1628
  fn=submit_agent,
1629
- inputs=[github_input, name_input, organization_input, description_input, website_input],
1630
- outputs=[submission_status, leaderboard_table, monthly_plot]
1631
  )
1632
 
1633
 
 
3
  import json
4
  import os
5
  import time
 
6
  import requests
 
 
7
  from huggingface_hub import HfApi, hf_hub_download
8
  from huggingface_hub.errors import HfHubHTTPError
9
+ import backoff
10
  from dotenv import load_dotenv
11
  import pandas as pd
 
12
  import random
13
  import plotly.graph_objects as go
14
  from plotly.subplots import make_subplots
15
  from apscheduler.schedulers.background import BackgroundScheduler
16
  from apscheduler.triggers.cron import CronTrigger
 
17
 
18
  # Load environment variables
19
  load_dotenv()
 
23
  # =============================================================================
24
 
25
  AGENTS_REPO = "SWE-Arena/bot_metadata" # HuggingFace dataset for agent metadata
26
+ LEADERBOARD_REPO = "SWE-Arena/leaderboard_metadata" # HuggingFace dataset for leaderboard data
 
 
 
27
 
28
  LEADERBOARD_COLUMNS = [
29
  ("Agent Name", "string"),
 
33
  ("Acceptance Rate (%)", "number"),
34
  ]
35
 
 
 
 
36
  # =============================================================================
37
+ # HUGGINGFACE API WRAPPERS WITH BACKOFF
 
 
 
 
 
 
 
 
38
  # =============================================================================
39
 
40
  def is_rate_limit_error(e):
 
44
  return False
45
 
46
 
 
 
 
 
 
 
 
 
47
  @backoff.on_exception(
48
  backoff.expo,
49
  HfHubHTTPError,
 
50
  max_tries=8,
51
+ base=300,
52
+ max_value=3600,
53
+ giveup=lambda e: not is_rate_limit_error(e),
54
+ on_backoff=lambda details: print(
55
+ f"Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/8..."
56
+ )
57
  )
58
  def list_repo_files_with_backoff(api, **kwargs):
59
+ """Wrapper for api.list_repo_files() with exponential backoff for rate limits."""
60
  return api.list_repo_files(**kwargs)
61
 
62
 
63
  @backoff.on_exception(
64
  backoff.expo,
65
  HfHubHTTPError,
 
66
  max_tries=8,
67
+ base=300,
68
+ max_value=3600,
69
+ giveup=lambda e: not is_rate_limit_error(e),
70
+ on_backoff=lambda details: print(
71
+ f"Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/8..."
72
+ )
73
  )
74
  def hf_hub_download_with_backoff(**kwargs):
75
+ """Wrapper for hf_hub_download() with exponential backoff for rate limits."""
76
  return hf_hub_download(**kwargs)
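As a minimal sketch (not part of this commit), the same decorator pattern could cover any other `HfApi` call that can return HTTP 429. The wrapper name below is illustrative; it reuses the `is_rate_limit_error()` helper defined above and the real `HfApi.upload_file()` API.

```python
# Illustrative only: backoff-wrapped upload, mirroring the two read wrappers above.
import backoff
from huggingface_hub import HfApi
from huggingface_hub.errors import HfHubHTTPError


@backoff.on_exception(
    backoff.expo,
    HfHubHTTPError,
    max_tries=8,
    base=300,
    max_value=3600,
    giveup=lambda e: not is_rate_limit_error(e),  # helper defined earlier in this section
    on_backoff=lambda details: print(
        f"Rate limited. Retrying in {details['wait']:.0f}s - attempt {details['tries']}/8..."
    ),
)
def upload_file_with_backoff(api: HfApi, **kwargs):
    """Wrapper for api.upload_file() with exponential backoff for rate limits."""
    return api.upload_file(**kwargs)
```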
77
 
78
 
 
 
 
 
 
79
  # =============================================================================
80
+ # GITHUB API OPERATIONS
81
  # =============================================================================
82
 
83
+ def request_with_backoff(method, url, *, headers=None, params=None, json_body=None, data=None, max_retries=10, timeout=30):
84
  """
85
+ Perform an HTTP request with exponential backoff and jitter for GitHub API.
86
+ Retries on 403/429 (rate limits), 5xx server errors, and transient network exceptions.
87
 
88
+ Returns the final requests.Response on success or non-retryable status, or None after exhausting retries.
 
 
 
 
 
 
 
 
 
89
  """
90
+ delay = 1.0
91
+ for attempt in range(max_retries):
 
 
 
 
 
 
92
  try:
93
+ resp = requests.request(
94
+ method,
95
+ url,
96
+ headers=headers or {},
97
+ params=params,
98
+ json=json_body,
99
+ data=data,
100
+ timeout=timeout
101
  )
102
 
103
+ status = resp.status_code
 
 
 
 
 
 
 
104
 
105
+ # Success
106
+ if 200 <= status < 300:
107
+ return resp
108
 
109
+ # Rate limits or server errors -> retry with backoff
110
+ if status in (403, 429) or 500 <= status < 600:
111
+ wait = None
 
 
 
 
 
 
 
112
 
113
+ # Prefer Retry-After when present
114
+ retry_after = resp.headers.get('Retry-After') or resp.headers.get('retry-after')
115
+ if retry_after:
116
+ try:
117
+ wait = float(retry_after)
118
+ except Exception:
119
+ wait = None
 
 
 
 
120
 
121
+ # Fallback to X-RateLimit-Reset when 403/429
122
+ if wait is None and status in (403, 429):
123
+ reset_hdr = resp.headers.get('X-RateLimit-Reset') or resp.headers.get('x-ratelimit-reset')
124
+ if reset_hdr:
125
+ try:
126
+ reset_timestamp = int(float(reset_hdr))
127
+ wait = max(reset_timestamp - time.time() + 2, 1)
128
+ except Exception:
129
+ wait = None
130
 
131
+ # Final fallback: exponential backoff with jitter
132
+ if wait is None:
133
+ wait = delay + random.uniform(0, 0.5)
 
 
 
 
 
 
134
 
135
+ # Cap individual wait to avoid extreme sleeps
136
+ wait = max(1.0, min(wait, 120.0))
137
+ print(f"GitHub API {status}. Backing off {wait:.1f}s (attempt {attempt + 1}/{max_retries})...")
138
+ time.sleep(wait)
139
+ delay = min(delay * 2, 60.0)
140
+ continue
141
 
142
+ # Non-retryable error; return response for caller to handle
143
+ return resp
144
 
145
+ except requests.RequestException as e:
146
+ # Network error -> retry with backoff
147
+ wait = delay + random.uniform(0, 0.5)
148
+ wait = max(1.0, min(wait, 60.0))
149
+ print(f"Request error: {e}. Retrying in {wait:.1f}s (attempt {attempt + 1}/{max_retries})...")
150
+ time.sleep(wait)
151
+ delay = min(delay * 2, 60.0)
152
 
153
+ print(f"Exceeded max retries for {url}")
154
+ return None
 
 
 
 
155
 
156
 
157
  def validate_github_username(identifier):
158
+ """Verify that a GitHub identifier exists with backoff-aware requests."""
159
  try:
 
 
160
  url = f'https://api.github.com/users/{identifier}'
161
+ response = request_with_backoff('GET', url, max_retries=1)
162
+ if response is None:
163
+ return False, "Validation error: network/rate limit exhausted"
164
  if response.status_code == 200:
165
  return True, "Username is valid"
166
  elif response.status_code == 404:
 
171
  return False, f"Validation error: {str(e)}"
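A rough usage sketch for `request_with_backoff()` beyond username validation (illustrative only; the endpoint choice and the `GITHUB_TOKEN` environment variable are assumptions, not part of app.py):

```python
# Illustrative only: point request_with_backoff() at GitHub's rate-limit endpoint
# to see how many core API calls remain before the next reset.
import os

headers = {"Accept": "application/vnd.github+json"}
if os.getenv("GITHUB_TOKEN"):
    headers["Authorization"] = f"Bearer {os.getenv('GITHUB_TOKEN')}"

resp = request_with_backoff("GET", "https://api.github.com/rate_limit", headers=headers)
if resp is not None and resp.status_code == 200:
    remaining = resp.json()["resources"]["core"]["remaining"]
    print(f"Core API calls remaining: {remaining}")
```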
172
 
173
 
 
 
 
 
 
 
 
174
  # =============================================================================
175
  # HUGGINGFACE DATASET OPERATIONS
176
  # =============================================================================
 
182
  agents = []
183
 
184
  # List all files in the repository
185
+ files = list_repo_files_with_backoff(api=api, repo_id=AGENTS_REPO, repo_type="dataset")
186
 
187
  # Filter for JSON files only
188
  json_files = [f for f in files if f.endswith('.json')]
189
 
 
 
190
  # Download and parse each JSON file
191
  for json_file in json_files:
192
  try:
 
203
  if agent_data.get('status') != 'public':
204
  continue
205
 
206
+ # Extract github_identifier from filename (e.g., "agent[bot].json" -> "agent[bot]")
207
+ filename_identifier = json_file.replace('.json', '')
208
+
209
+ # Add or override github_identifier to match filename
210
+ agent_data['github_identifier'] = filename_identifier
211
 
212
  agents.append(agent_data)
213
 
 
215
  print(f"Warning: Could not load {json_file}: {str(e)}")
216
  continue
217
 
218
+ print(f"Loaded {len(agents)} agents from HuggingFace")
219
  return agents
220
 
221
  except Exception as e:
 
231
  return token
232
 
233
 
 
 
 
 
 
 
234
  def upload_with_retry(api, path_or_fileobj, path_in_repo, repo_id, repo_type, token, max_retries=5):
235
  """
236
  Upload file to HuggingFace with exponential backoff retry logic.
 
259
  token=token
260
  )
261
  if attempt > 0:
262
+ print(f" Upload succeeded on attempt {attempt + 1}/{max_retries}")
263
  return True
264
 
265
  except Exception as e:
266
  if attempt < max_retries - 1:
267
  wait_time = delay + random.uniform(0, 1.0)
268
+ print(f" Upload failed (attempt {attempt + 1}/{max_retries}): {str(e)}")
269
+ print(f" Retrying in {wait_time:.1f} seconds...")
270
  time.sleep(wait_time)
271
  delay = min(delay * 2, 60.0) # Exponential backoff, max 60s
272
  else:
273
+ print(f" Upload failed after {max_retries} attempts: {str(e)}")
274
  raise
275
 
276
 
 
300
  repo_type="dataset",
301
  token=token
302
  )
303
+ print(f"Saved agent to HuggingFace: {filename}")
304
  return True
305
  finally:
306
  # Always clean up local file, even if upload fails
 
308
  os.remove(filename)
309
 
310
  except Exception as e:
311
+ print(f"Error saving agent: {str(e)}")
312
  return False
313
 
314
 
315
+ def load_leaderboard_data_from_hf():
316
  """
317
+ Load leaderboard data and monthly metrics from HuggingFace dataset.
 
318
 
319
  Returns:
320
+ dict: Dictionary with 'leaderboard', 'monthly_metrics', and 'metadata' keys
321
+ Returns None if file doesn't exist or error occurs
322
  """
 
 
323
  try:
324
  token = get_hf_token()
325
+ filename = "swe-pr.json"
 
 
 
 
 
 
 
326
 
327
+ # Download file
328
+ file_path = hf_hub_download_with_backoff(
 
 
 
 
329
  repo_id=LEADERBOARD_REPO,
330
+ filename=filename,
331
  repo_type="dataset",
332
+ token=token
 
333
  )
334
 
335
+ # Load JSON data
336
+ with open(file_path, 'r') as f:
337
+ data = json.load(f)
 
 
 
 
 
 
 
 
338
 
339
+ last_updated = data.get('metadata', {}).get('last_updated', 'Unknown')
340
+ print(f"Loaded leaderboard data from HuggingFace (last updated: {last_updated})")
 
341
 
342
+ return data
 
 
 
 
 
343
 
344
  except Exception as e:
345
+ print(f"Could not load leaderboard data from HuggingFace: {str(e)}")
346
+ return None
 
 
 
 
 
 
 
347
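For reference, the shape of `swe-pr.json` that the loading and display functions expect can be sketched as follows. The keys mirror what the code reads (`leaderboard`, `monthly_metrics`, `metadata.last_updated`, and the per-agent fields); the concrete values are made up for illustration.

```python
# Illustrative payload shape only; values are fabricated examples.
example_leaderboard_payload = {
    "leaderboard": {
        "my-agent[bot]": {
            "name": "My Agent",
            "website": "https://example.com",
            "github_identifier": "my-agent[bot]",
            "total_prs": 120,
            "merged_prs": 80,
            "acceptance_rate": 66.7,
        }
    },
    "monthly_metrics": {
        "agents": ["My Agent"],
        "months": ["2025-09", "2025-10"],
        "data": {
            "My Agent": {
                "acceptance_rates": [60.0, 66.7],
                "total_prs": [50, 70],
                "merged_prs": [30, 50],
                "closed_not_merged": [20, 20],
            }
        },
    },
    "metadata": {"last_updated": "2025-11-01T00:00:00+00:00"},
}
```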
 
348
 
349
  # =============================================================================
350
  # UI FUNCTIONS
351
  # =============================================================================
352
 
 
 
 
 
 
 
 
 
353
  def create_monthly_metrics_plot(top_n=5):
354
  """
355
  Create a Plotly figure with dual y-axes showing:
356
+ - Left y-axis: Acceptance Rate (%) as line curves
357
  - Right y-axis: Total PRs created as bar charts
358
 
359
  Each agent gets a unique color for both their line and bars.
 
361
  Args:
362
  top_n: Number of top agents to show (default: 5)
363
  """
364
+ # Load from saved dataset
365
+ saved_data = load_leaderboard_data_from_hf()
366
+
367
+ if not saved_data or 'monthly_metrics' not in saved_data:
368
+ # Return an empty figure with a message
369
+ fig = go.Figure()
370
+ fig.add_annotation(
371
+ text="No data available for visualization",
372
+ xref="paper", yref="paper",
373
+ x=0.5, y=0.5, showarrow=False,
374
+ font=dict(size=16)
375
+ )
376
+ fig.update_layout(
377
+ title=None,
378
+ xaxis_title=None,
379
+ height=500
380
+ )
381
+ return fig
382
+
383
+ metrics = saved_data['monthly_metrics']
384
+ print(f"Loaded monthly metrics from saved dataset")
385
+
386
+ # Apply top_n filter if specified
387
+ if top_n is not None and top_n > 0 and metrics.get('agents'):
388
+ # Calculate total PRs for each agent
389
+ agent_totals = []
390
+ for agent_name in metrics['agents']:
391
+ agent_data = metrics['data'].get(agent_name, {})
392
+ total_prs = sum(agent_data.get('total_prs', []))
393
+ agent_totals.append((agent_name, total_prs))
394
+
395
+ # Sort by total PRs and take top N
396
+ agent_totals.sort(key=lambda x: x[1], reverse=True)
397
+ top_agents = [agent_name for agent_name, _ in agent_totals[:top_n]]
398
+
399
+ # Filter metrics to only include top agents
400
+ metrics = {
401
+ 'agents': top_agents,
402
+ 'months': metrics['months'],
403
+ 'data': {agent: metrics['data'][agent] for agent in top_agents if agent in metrics['data']}
404
+ }
405
 
406
  if not metrics['agents'] or not metrics['months']:
407
  # Return an empty figure with a message
 
422
  # Create figure with secondary y-axis
423
  fig = make_subplots(specs=[[{"secondary_y": True}]])
424
 
425
+ # Generate unique colors for many agents using HSL color space
426
+ def generate_color(index, total):
427
+ """Generate distinct colors using HSL color space for better distribution"""
428
+ hue = (index * 360 / total) % 360
429
+ saturation = 70 + (index % 3) * 10 # Vary saturation slightly
430
+ lightness = 45 + (index % 2) * 10 # Vary lightness slightly
431
+ return f'hsl({hue}, {saturation}%, {lightness}%)'
432
+
433
  agents = metrics['agents']
434
  months = metrics['months']
435
  data = metrics['data']
436
 
437
+ # Generate colors for all agents
438
  agent_colors = {agent: generate_color(idx, len(agents)) for idx, agent in enumerate(agents)}
439
 
440
  # Add traces for each agent
 
456
  name=agent_name,
457
  mode='lines+markers',
458
  line=dict(color=color, width=2),
459
+ marker=dict(size=8),
460
  legendgroup=agent_name,
461
+ showlegend=(top_n is not None and top_n <= 10), # Show legend for top N agents
462
+ hovertemplate='<b>Agent: %{fullData.name}</b><br>' +
463
+ 'Month: %{x}<br>' +
464
  'Acceptance Rate: %{y:.2f}%<br>' +
465
  '<extra></extra>'
466
  ),
 
484
  name=agent_name,
485
  marker=dict(color=color, opacity=0.6),
486
  legendgroup=agent_name,
487
+ showlegend=False, # Hide duplicate legend entry (already shown in Scatter)
488
+ hovertemplate='<b>Agent: %{fullData.name}</b><br>' +
489
+ 'Month: %{x}<br>' +
490
  'Total PRs: %{y}<br>' +
491
  '<extra></extra>',
492
  offsetgroup=agent_name # Group bars by agent for proper spacing
 
496
 
497
  # Update axes labels
498
  fig.update_xaxes(title_text=None)
499
+ fig.update_yaxes(
500
+ title_text="<b>Acceptance Rate (%)</b>",
501
+ range=[0, 100],
502
+ secondary_y=False,
503
+ showticklabels=True,
504
+ tickmode='linear',
505
+ dtick=10,
506
+ showgrid=True
507
+ )
508
  fig.update_yaxes(title_text="<b>Total PRs</b>", secondary_y=True)
509
 
510
  # Update layout
511
+ show_legend = (top_n is not None and top_n <= 10)
512
  fig.update_layout(
513
  title=None,
514
+ hovermode='closest', # Show individual agent info on hover
515
  barmode='group',
516
  height=600,
517
+ showlegend=show_legend,
518
+ margin=dict(l=50, r=150 if show_legend else 50, t=50, b=50) # More right margin when legend is shown
 
 
 
 
 
 
519
  )
520
 
521
  return fig
 
523
 
524
  def get_leaderboard_dataframe():
525
  """
526
+ Load leaderboard from saved dataset and convert to pandas DataFrame for display.
 
527
  Returns formatted DataFrame sorted by total PRs.
528
  """
529
+ # Load from saved dataset
530
+ saved_data = load_leaderboard_data_from_hf()
531
+
532
+ if not saved_data or 'leaderboard' not in saved_data:
533
+ print(f"No leaderboard data available")
534
+ # Return empty DataFrame with correct columns if no data
535
+ column_names = [col[0] for col in LEADERBOARD_COLUMNS]
536
+ return pd.DataFrame(columns=column_names)
537
 
538
+ cache_dict = saved_data['leaderboard']
539
+ last_updated = saved_data.get('metadata', {}).get('last_updated', 'Unknown')
540
+ print(f"Loaded leaderboard from saved dataset (last updated: {last_updated})")
541
+ print(f"Cache dict size: {len(cache_dict)}")
 
 
542
 
543
  if not cache_dict:
544
+ print("WARNING: cache_dict is empty!")
545
  # Return empty DataFrame with correct columns if no data
546
  column_names = [col[0] for col in LEADERBOARD_COLUMNS]
547
  return pd.DataFrame(columns=column_names)
548
 
549
  rows = []
550
+ filtered_count = 0
551
  for identifier, data in cache_dict.items():
552
+ total_prs = data.get('total_prs', 0)
553
+ print(f" Agent '{identifier}': {total_prs} PRs")
554
+
555
  # Filter out agents with zero total PRs
556
+ if total_prs == 0:
557
+ filtered_count += 1
558
+ continue
559
+
560
+ # Only include display-relevant fields
561
+ rows.append([
562
+ data.get('name', 'Unknown'),
563
+ data.get('website', 'N/A'),
564
+ total_prs,
565
+ data.get('merged_prs', 0),
566
+ data.get('acceptance_rate', 0.0),
567
+ ])
568
+
569
+ print(f"Filtered out {filtered_count} agents with 0 PRs")
570
+ print(f"Leaderboard will show {len(rows)} agents")
571
 
572
  # Create DataFrame
573
  column_names = [col[0] for col in LEADERBOARD_COLUMNS]
 
583
  if "Total PRs" in df.columns and not df.empty:
584
  df = df.sort_values(by="Total PRs", ascending=False).reset_index(drop=True)
585
 
586
+ print(f"Final DataFrame shape: {df.shape}")
587
+ print("="*60 + "\n")
588
+
589
  return df
590
 
591
 
592
+ def submit_agent(identifier, agent_name, organization, website):
593
  """
594
  Submit a new agent to the leaderboard.
595
  Validates input and saves submission.
 
596
  """
597
  # Validate required fields
598
  if not identifier or not identifier.strip():
599
+ return "ERROR: GitHub identifier is required", get_leaderboard_dataframe()
600
  if not agent_name or not agent_name.strip():
601
+ return "ERROR: Agent name is required", get_leaderboard_dataframe()
602
  if not organization or not organization.strip():
603
+ return "ERROR: Organization name is required", get_leaderboard_dataframe()
604
  if not website or not website.strip():
605
+ return "ERROR: Website URL is required", get_leaderboard_dataframe()
606
 
607
  # Clean inputs
608
  identifier = identifier.strip()
609
  agent_name = agent_name.strip()
610
  organization = organization.strip()
 
611
  website = website.strip()
612
 
613
  # Validate GitHub identifier
614
  is_valid, message = validate_github_username(identifier)
615
  if not is_valid:
616
+ return f"ERROR: {message}", get_leaderboard_dataframe()
617
 
618
  # Check for duplicates by loading agents from HuggingFace
619
  agents = load_agents_from_hf()
620
  if agents:
621
  existing_names = {agent['github_identifier'] for agent in agents}
622
  if identifier in existing_names:
623
+ return f"WARNING: Agent with identifier '{identifier}' already exists", get_leaderboard_dataframe()
624
 
625
  # Create submission
626
  submission = {
627
  'name': agent_name,
628
  'organization': organization,
629
  'github_identifier': identifier,
 
630
  'website': website,
631
+ 'status': 'public'
632
  }
633
 
634
  # Save to HuggingFace
635
  if not save_agent_to_hf(submission):
636
+ return "ERROR: Failed to save submission", get_leaderboard_dataframe()
637
+
638
+ # Return success message - data will be populated by backend updates
639
+ return f"SUCCESS: Successfully submitted {agent_name}! PR data will be populated by the backend system.", get_leaderboard_dataframe()
640
 
641
+
642
+ # =============================================================================
643
+ # DATA RELOAD FUNCTION
644
+ # =============================================================================
645
+
646
+ def reload_leaderboard_data():
647
+ """
648
+ Reload leaderboard data from HuggingFace.
649
+ This function is called by the scheduler on a daily basis.
650
+ """
651
+ print(f"\n{'='*80}")
652
+ print(f"Reloading leaderboard data from HuggingFace...")
653
+ print(f"{'='*80}\n")
654
+
655
+ try:
656
+ data = load_leaderboard_data_from_hf()
657
+ if data:
658
+ print(f"Successfully reloaded leaderboard data")
659
+ print(f" Last updated: {data.get('metadata', {}).get('last_updated', 'Unknown')}")
660
+ print(f" Agents: {len(data.get('leaderboard', {}))}")
661
+ else:
662
+ print(f"No data available")
663
+ except Exception as e:
664
+ print(f"Error reloading leaderboard data: {str(e)}")
665
+
666
+ print(f"{'='*80}\n")
667
 
668
 
669
  # =============================================================================
670
  # GRADIO APPLICATION
671
  # =============================================================================
672
 
673
+ print(f"\nStarting SWE Agent PR Leaderboard")
674
+ print(f" Data source: {LEADERBOARD_REPO}")
675
+ print(f" Reload frequency: Daily at 12:00 AM UTC\n")
676
 
677
+ # Start APScheduler for daily data reload at 12:00 AM UTC
678
  scheduler = BackgroundScheduler(timezone="UTC")
679
  scheduler.add_job(
680
+ reload_leaderboard_data,
681
+ trigger=CronTrigger(hour=0, minute=0), # 12:00 AM UTC daily
682
+ id='daily_data_reload',
683
+ name='Daily Data Reload',
684
  replace_existing=True
685
  )
686
  scheduler.start()
687
  print(f"\n{'='*80}")
688
+ print(f"Scheduler initialized successfully")
689
+ print(f"Reload schedule: Daily at 12:00 AM UTC")
690
+ print(f"On startup: Loads cached data from HuggingFace on demand")
691
  print(f"{'='*80}\n")
692
 
 
 
 
 
 
 
 
 
 
 
 
693
  # Create Gradio interface
694
  with gr.Blocks(title="SWE Agent PR Leaderboard", theme=gr.themes.Soft()) as app:
695
+ gr.Markdown("# SWE Agent PR Leaderboard")
 
 
696
  gr.Markdown(f"Track and compare GitHub pull request statistics for SWE agents")
697
 
698
  with gr.Tabs():
699
 
700
  # Leaderboard Tab
701
+ with gr.Tab("Leaderboard"):
702
+ gr.Markdown("*Statistics are based on agent PR activity tracked by the system*")
 
703
  leaderboard_table = Leaderboard(
704
+ value=pd.DataFrame(columns=[col[0] for col in LEADERBOARD_COLUMNS]), # Empty initially
705
  datatype=LEADERBOARD_COLUMNS,
706
  search_columns=["Agent Name", "Website"],
707
  filter_columns=[
 
716
  ]
717
  )
718
 
719
+ # Load leaderboard data when app starts
720
+ app.load(
721
+ fn=get_leaderboard_dataframe,
722
+ inputs=[],
723
+ outputs=[leaderboard_table]
724
+ )
725
+
726
+ # Monthly Metrics Section
727
+ gr.Markdown("---") # Divider
728
+ gr.Markdown("### Monthly Performance - Top 5 Agents")
729
+ gr.Markdown("*Shows acceptance rate trends and PR volumes for the most active agents*")
730
 
731
+ monthly_metrics_plot = gr.Plot(label="Monthly Metrics")
732
+
733
+ # Load monthly metrics when app starts
734
+ app.load(
735
+ fn=lambda: create_monthly_metrics_plot(),
736
+ inputs=[],
737
+ outputs=[monthly_metrics_plot]
738
  )
739
 
740
+
741
  # Submit Agent Tab
742
+ with gr.Tab("Submit Agent"):
743
 
744
  gr.Markdown("### Submit Your Agent")
745
  gr.Markdown("Fill in the details below to add your agent to the leaderboard.")
 
748
  with gr.Column():
749
  github_input = gr.Textbox(
750
  label="GitHub Identifier*",
751
+ placeholder="Your agent username (e.g., my-agent[bot])"
752
  )
753
  name_input = gr.Textbox(
754
  label="Agent Name*",
 
760
  label="Organization*",
761
  placeholder="Your organization or team name"
762
  )
 
 
 
 
 
763
  website_input = gr.Textbox(
764
  label="Website*",
765
  placeholder="https://your-agent-website.com"
 
777
  # Event handler
778
  submit_button.click(
779
  fn=submit_agent,
780
+ inputs=[github_input, name_input, organization_input, website_input],
781
+ outputs=[submission_status, leaderboard_table]
782
  )
783
 
784
 
docker-compose.yml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
1
+ services:
2
+ msr-miner:
3
+ build:
4
+ context: .
5
+ dockerfile: Dockerfile
6
+ container_name: gharchive-miner
7
+ restart: unless-stopped
8
+ env_file:
9
+ - .env
10
+ volumes:
11
+ # Mount entire workspace for live code updates
12
+ - .:/app
13
+ # Mount gharchive workspace for data storage
14
+ - ../gharchive:/gharchive:ro
15
+ environment:
16
+ - PYTHONUNBUFFERED=1
17
+ logging:
18
+ driver: "json-file"
19
+ options:
20
+ max-size: "10m"
21
+ max-file: "3"
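With this compose file in place, the miner is typically started with `docker compose up -d --build` and followed with `docker compose logs -f msr-miner`. The workspace bind mount keeps code changes live inside the container, while `../gharchive` is mounted read-only (`:ro`); since `msr.py` also downloads archives into `../gharchive/data`, that mount may need to be writable depending on how the data directory is populated.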
msr.py CHANGED
@@ -1,18 +1,25 @@
1
  """
2
  Minimalist PR Metadata Mining Script
3
- Mines PR metadata from GitHub Archive via BigQuery and saves to HuggingFace dataset.
4
  """
5
 
6
  import json
7
  import os
 
8
  import tempfile
9
  from datetime import datetime, timezone, timedelta
10
  from collections import defaultdict
 
11
  from huggingface_hub import HfApi, hf_hub_download
12
  from huggingface_hub.errors import HfHubHTTPError
13
  from dotenv import load_dotenv
14
- from google.cloud import bigquery
15
  import backoff
 
 
 
 
 
16
 
17
  # Load environment variables
18
  load_dotenv()
@@ -23,8 +30,27 @@ load_dotenv()
23
 
24
  AGENTS_REPO = "SWE-Arena/bot_metadata"
25
  PR_METADATA_REPO = "SWE-Arena/pr_metadata"
26
- LEADERBOARD_REPO = "SWE-Arena/leaderboard_metadata" # For storing computed leaderboard data
27
- LEADERBOARD_TIME_FRAME_DAYS = 180 # Time frame for mining new PRs
 
 
 
 
 
 
28
 
29
  # =============================================================================
30
  # UTILITY FUNCTIONS
@@ -54,246 +80,329 @@ def save_jsonl(filename, data):
54
  f.write(json.dumps(item) + '\n')
55
 
56
 
57
- def get_bigquery_client():
58
  """
59
- Initialize BigQuery client using credentials from environment variable.
60
-
61
- Expects GOOGLE_APPLICATION_CREDENTIALS_JSON environment variable containing
62
- the service account JSON credentials as a string.
 
63
  """
64
- # Get the JSON content from environment variable
65
- creds_json = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS_JSON')
66
 
67
- if creds_json:
68
- # Create a temporary file to store credentials
69
- with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as temp_file:
70
- temp_file.write(creds_json)
71
- temp_path = temp_file.name
72
 
73
- # Set environment variable to point to temp file
74
- os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = temp_path
75
 
76
- # Initialize BigQuery client
77
- client = bigquery.Client()
 
 
 
78
 
79
- # Clean up temp file
80
- os.unlink(temp_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
- return client
83
- else:
84
- raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON not found in environment")
85
 
 
 
 
86
 
87
- def generate_table_union_statements(start_date, end_date):
88
  """
89
- Generate UNION ALL statements for githubarchive.month tables in date range.
90
 
91
  Args:
92
- start_date: Start datetime
93
- end_date: End datetime
94
 
95
  Returns:
96
- String with UNION ALL SELECT statements for all tables in range
97
  """
98
- table_names = []
 
99
 
100
- # Start from the beginning of start_date's month
101
- current_date = start_date.replace(day=1)
102
- end_month = end_date.replace(day=1)
103
 
104
- while current_date <= end_month:
105
- table_name = f"`githubarchive.month.{current_date.strftime('%Y%m')}`"
106
- table_names.append(table_name)
 
 
 
 
 
107
 
108
- # Move to next month
109
- if current_date.month == 12:
110
- current_date = current_date.replace(year=current_date.year + 1, month=1)
111
- else:
112
- current_date = current_date.replace(month=current_date.month + 1)
 
 
 
 
 
 
113
 
114
- # Create UNION ALL chain
115
- union_parts = [f"SELECT * FROM {table}" for table in table_names]
116
- return " UNION ALL ".join(union_parts)
117
 
118
 
119
- def get_hf_token():
120
- """Get HuggingFace token from environment variables."""
121
- token = os.getenv('HF_TOKEN')
122
- if not token:
123
- print("Warning: HF_TOKEN not found in environment variables")
124
- return token
 
 
 
 
 
 
125
 
126
 
127
  # =============================================================================
128
- # HUGGINGFACE API RETRY WRAPPERS
129
  # =============================================================================
130
 
131
- def is_rate_limit_error(e):
132
- """Check if exception is a HuggingFace rate limit error (429)."""
 
 
 
133
  if isinstance(e, HfHubHTTPError):
134
- return e.response.status_code == 429
135
- return False
136
 
 
 
 
 
 
137
 
138
- def backoff_handler(details):
139
- """Handler to print retry attempt information."""
140
- wait_time = details['wait']
141
- tries = details['tries']
142
- wait_minutes = wait_time / 60
143
- print(f" ⏳ Rate limited. Retrying in {wait_minutes:.1f} minutes ({wait_time:.0f}s) - attempt {tries}/8...")
 
144
 
145
 
146
  @backoff.on_exception(
147
  backoff.expo,
148
- HfHubHTTPError,
149
- giveup=lambda e: not is_rate_limit_error(e),
150
  max_tries=8,
151
- base=300, # Start at 5 minutes (300 seconds)
152
- max_value=3600, # Cap at 60 minutes (3600 seconds)
153
- jitter=backoff.full_jitter,
154
- on_backoff=backoff_handler
 
 
155
  )
156
  def list_repo_files_with_backoff(api, **kwargs):
157
- """Wrapper for HfApi.list_repo_files with exponential backoff on rate limits."""
158
  return api.list_repo_files(**kwargs)
159
 
160
 
161
  @backoff.on_exception(
162
  backoff.expo,
163
- HfHubHTTPError,
164
- giveup=lambda e: not is_rate_limit_error(e),
165
  max_tries=8,
166
- base=300, # Start at 5 minutes (300 seconds)
167
- max_value=3600, # Cap at 60 minutes (3600 seconds)
168
- jitter=backoff.full_jitter,
169
- on_backoff=backoff_handler
 
 
170
  )
171
  def hf_hub_download_with_backoff(**kwargs):
172
- """Wrapper for hf_hub_download with exponential backoff on rate limits."""
173
  return hf_hub_download(**kwargs)
174
 
175
 
176
  @backoff.on_exception(
177
  backoff.expo,
178
- HfHubHTTPError,
179
- giveup=lambda e: not is_rate_limit_error(e),
180
  max_tries=8,
181
- base=300, # Start at 5 minutes (300 seconds)
182
- max_value=3600, # Cap at 60 minutes (3600 seconds)
183
- jitter=backoff.full_jitter,
184
- on_backoff=backoff_handler
 
 
185
  )
186
- def upload_folder_with_backoff(api, **kwargs):
187
- """Wrapper for HfApi.upload_folder with exponential backoff on rate limits."""
188
- return api.upload_folder(**kwargs)
189
 
190
 
191
  @backoff.on_exception(
192
  backoff.expo,
193
- HfHubHTTPError,
194
- giveup=lambda e: not is_rate_limit_error(e),
195
  max_tries=8,
196
- base=300, # Start at 5 minutes (300 seconds)
197
- max_value=3600, # Cap at 60 minutes (3600 seconds)
198
- jitter=backoff.full_jitter,
199
- on_backoff=backoff_handler
 
 
200
  )
201
- def upload_file_with_backoff(api, **kwargs):
202
- """Wrapper for HfApi.upload_file with exponential backoff on rate limits."""
203
- return api.upload_file(**kwargs)
204
-
205
 
206
- # =============================================================================
207
- # BIGQUERY FUNCTIONS
208
- # =============================================================================
209
 
210
- def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=100, upload_immediately=True):
211
  """
212
- Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
213
- Splits agents into smaller batches to avoid performance issues with large numbers of agents.
214
-
215
- Args:
216
- client: BigQuery client instance
217
- identifiers: List of GitHub usernames/bot identifiers
218
- start_date: Start datetime (timezone-aware)
219
- end_date: End datetime (timezone-aware)
220
- batch_size: Number of agents to process per batch (default: 100)
221
- upload_immediately: If True, upload each batch's results to HuggingFace immediately (default: True)
222
 
223
  Returns:
224
- Dictionary mapping agent identifier to list of issue metadata
225
  """
226
- # Split identifiers into batches
227
- batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
228
- total_batches = len(batches)
229
 
230
- print(f"\n🔍 Using BATCHED approach for {len(identifiers)} agents")
231
- print(f" Total batches: {total_batches} (batch size: {batch_size})")
232
- print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
233
- if upload_immediately:
234
- print(f" Upload mode: Immediate (after each batch)")
235
- else:
236
- print(f" Upload mode: Deferred (all at once)")
237
 
238
- # Collect results from all batches
239
- all_metadata = {}
240
 
241
- for batch_num, batch_identifiers in enumerate(batches, 1):
242
- print(f"\n📦 Processing batch {batch_num}/{total_batches} ({len(batch_identifiers)} agents)...")
243
 
244
- try:
245
- # Query each batch
246
- batch_results = fetch_all_pr_metadata_single_query(
247
- client, batch_identifiers, start_date, end_date
248
- )
249
 
250
- # Merge results
251
- for identifier, metadata_list in batch_results.items():
252
- if identifier in all_metadata:
253
- all_metadata[identifier].extend(metadata_list)
254
- else:
255
- all_metadata[identifier] = metadata_list
256
 
257
- print(f" ✓ Batch {batch_num}/{total_batches} complete")
 
 
 
 
258
 
259
- # Upload immediately after this batch if enabled
260
- if upload_immediately and batch_results:
261
- print(f"\n 📤 Uploading batch {batch_num}/{total_batches} results to HuggingFace...")
262
- upload_success = 0
263
- upload_errors = 0
264
 
265
- for identifier, metadata_list in batch_results.items():
266
- if metadata_list:
267
- if save_pr_metadata_to_hf(metadata_list, identifier):
268
- upload_success += 1
269
- else:
270
- upload_errors += 1
 
 
 
271
 
272
- print(f" ✓ Batch {batch_num}/{total_batches} upload complete ({upload_success} agents uploaded, {upload_errors} errors)")
 
 
273
 
274
- except Exception as e:
275
- print(f" ✗ Batch {batch_num}/{total_batches} failed: {str(e)}")
276
- print(f" Continuing with remaining batches...")
277
- continue
278
 
279
- total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
280
- print(f"\n✓ All batches complete! Found {total_prs} total PRs across {len(all_metadata)} agents")
 
281
 
282
- return all_metadata
283
 
284
 
285
- def fetch_all_pr_metadata_single_query(client, identifiers, start_date, end_date):
286
- """
287
- Fetch PR metadata for a BATCH of agents using ONE comprehensive BigQuery query.
288
 
289
- NOTE: This function is designed for smaller batches (~100 agents).
290
- For large numbers of agents, use fetch_issue_metadata_batched() instead.
 
291
 
292
  This query fetches:
293
  1. PRs authored by agents (user.login matches identifier)
 
294
 
295
  Args:
296
- client: BigQuery client instance
297
  identifiers: List of GitHub usernames/bot identifiers
298
  start_date: Start datetime (timezone-aware)
299
  end_date: End datetime (timezone-aware)
@@ -303,7 +412,7 @@ def fetch_all_pr_metadata_single_query(client, identifiers, start_date, end_date
303
  {
304
  'agent-identifier': [
305
  {
306
- 'url': PR URL,
307
  'created_at': Creation timestamp,
308
  'merged_at': Merge timestamp (if merged, else None),
309
  'closed_at': Close timestamp (if closed but not merged, else None)
@@ -313,35 +422,30 @@ def fetch_all_pr_metadata_single_query(client, identifiers, start_date, end_date
313
  ...
314
  }
315
  """
316
- print(f" Querying BigQuery for {len(identifiers)} agents in this batch...")
317
- print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
318
 
319
- # Generate table UNION statements for the time range
320
- table_union = generate_table_union_statements(start_date, end_date)
321
 
322
- # Build identifier list for SQL IN clause (author matching only)
323
- author_list = ', '.join([f"'{id}'" for id in identifiers])
324
-
325
- # Build comprehensive query with CTE
326
  query = f"""
327
  WITH pr_events AS (
328
  -- Get all PR events (opened, closed) for all agents
329
  SELECT
330
- JSON_EXTRACT_SCALAR(payload, '$.pull_request.html_url') as url,
331
- JSON_EXTRACT_SCALAR(payload, '$.pull_request.user.login') as pr_author,
332
- JSON_EXTRACT_SCALAR(payload, '$.pull_request.created_at') as created_at,
333
- CAST(JSON_EXTRACT_SCALAR(payload, '$.pull_request.merged') AS BOOL) as is_merged,
334
- JSON_EXTRACT_SCALAR(payload, '$.pull_request.merged_at') as merged_at,
335
- JSON_EXTRACT_SCALAR(payload, '$.pull_request.closed_at') as closed_at,
336
- JSON_EXTRACT_SCALAR(payload, '$.action') as action,
337
  created_at as event_time
338
- FROM (
339
- {table_union}
340
- ) t
341
  WHERE
342
- type = 'PullRequestEvent'
343
- AND JSON_EXTRACT_SCALAR(payload, '$.pull_request.html_url') IS NOT NULL
344
- AND JSON_EXTRACT_SCALAR(payload, '$.pull_request.user.login') IN ({author_list})
345
  ),
346
 
347
  pr_latest_state AS (
@@ -368,72 +472,77 @@ def fetch_all_pr_metadata_single_query(client, identifiers, start_date, end_date
368
  ORDER BY created_at DESC
369
  """
370
 
371
- print(f" Scanning {(end_date - start_date).days} days of GitHub Archive data...")
372
- print(f" Batch agents: {', '.join(identifiers[:5])}{'...' if len(identifiers) > 5 else ''}")
373
-
374
  try:
375
- query_job = client.query(query)
376
- results = list(query_job.result())
377
-
378
- print(f" ✓ Found {len(results)} PRs in this batch")
 
 
 
 
 
 
379
 
380
  # Group results by agent
381
  metadata_by_agent = defaultdict(list)
382
 
383
  for row in results:
384
- # Convert datetime objects to ISO strings
385
- created_at = row.created_at
386
- if hasattr(created_at, 'isoformat'):
387
- created_at = created_at.isoformat()
388
-
389
- merged_at = row.merged_at
390
- if hasattr(merged_at, 'isoformat'):
391
- merged_at = merged_at.isoformat()
392
-
393
- closed_at = row.closed_at
394
- if hasattr(closed_at, 'isoformat'):
395
- closed_at = closed_at.isoformat()
396
-
397
- pr_data = {
398
- 'html_url': row.url,
399
  'created_at': created_at,
400
  'merged_at': merged_at,
401
  'closed_at': closed_at,
402
- }
403
-
404
- # Assign to agent based on author
405
- pr_author = row.pr_author
406
- if pr_author and pr_author in identifiers:
407
- metadata_by_agent[pr_author].append(pr_data)
408
-
409
- # Print breakdown by agent (only show agents with PRs)
410
- print(f" 📊 Batch breakdown:")
411
- for identifier in identifiers:
412
- count = len(metadata_by_agent.get(identifier, []))
413
- if count > 0:
414
- metadata = metadata_by_agent[identifier]
415
- merged_count = sum(1 for m in metadata if m['merged_at'] is not None)
416
- closed_count = sum(1 for m in metadata if m['closed_at'] is not None and m['merged_at'] is None)
417
- open_count = count - merged_count - closed_count
418
- print(f" {identifier}: {count} PRs ({merged_count} merged, {closed_count} closed, {open_count} open)")
419
 
420
  # Convert defaultdict to regular dict
421
  return dict(metadata_by_agent)
422
 
423
  except Exception as e:
424
- print(f" BigQuery error: {str(e)}")
425
  import traceback
426
  traceback.print_exc()
427
  return {}
428
 
429
 
430
  # =============================================================================
431
- # HUGGINGFACE STORAGE FUNCTIONS
432
  # =============================================================================
433
 
434
  def group_metadata_by_date(metadata_list):
435
  """
436
- Group PR metadata by exact date (year.month.day) for efficient daily storage.
437
  Returns dict: {(year, month, day): [metadata_list]}
438
  """
439
  grouped = defaultdict(list)
@@ -453,20 +562,56 @@ def group_metadata_by_date(metadata_list):
453
  return dict(grouped)
454
 
455
 
456
- def save_pr_metadata_to_hf(metadata_list, agent_identifier):
457
  """
458
- Save PR metadata to HuggingFace dataset, organized by [agent_identifier]/YYYY.MM.DD.jsonl.
459
- Each file is stored in the agent's folder and named YYYY.MM.DD.jsonl for that day's PRs.
460
-
461
- This function OVERWRITES existing files completely with fresh data from BigQuery.
462
- Uses batch upload to avoid rate limit (uploads entire folder in single operation).
463
 
464
  Args:
465
- metadata_list: List of PR metadata dictionaries
466
- agent_identifier: GitHub identifier of the agent (used as folder name)
 
 
 
 
 
 
 
 
467
  """
468
- import shutil
 
 
 
 
 
 
469
 
 
 
 
470
  try:
471
  token = get_hf_token()
472
  if not token:
@@ -474,56 +619,89 @@ def save_pr_metadata_to_hf(metadata_list, agent_identifier):
474
 
475
  api = HfApi(token=token)
476
 
477
- # Group by date (year, month, day)
478
- grouped = group_metadata_by_date(metadata_list)
 
479
 
480
- if not grouped:
481
- print(f" No valid metadata to save for {agent_identifier}")
482
- return False
 
 
483
 
484
- # Create a temporary directory for batch upload
485
- temp_dir = tempfile.mkdtemp()
486
- agent_folder = os.path.join(temp_dir, agent_identifier)
487
- os.makedirs(agent_folder, exist_ok=True)
488
 
489
- try:
490
- print(f" 📦 Preparing batch upload for {len(grouped)} daily files...")
491
-
492
- # Process each daily file
493
- for (pr_year, month, day), day_metadata in grouped.items():
494
- filename = f"{agent_identifier}/{pr_year}.{month:02d}.{day:02d}.jsonl"
495
- local_filename = os.path.join(agent_folder, f"{pr_year}.{month:02d}.{day:02d}.jsonl")
496
-
497
- # Sort by created_at for better organization
498
- day_metadata.sort(key=lambda x: x.get('created_at', ''), reverse=True)
499
-
500
- # Save to temp directory (complete overwrite, no merging)
501
- save_jsonl(local_filename, day_metadata)
502
- print(f" Prepared {len(day_metadata)} PRs for {filename}")
503
-
504
- # Upload entire folder using upload_folder (single commit per agent)
505
- print(f" 📤 Uploading {len(grouped)} files ({len(metadata_list)} total PRs)...")
506
- upload_folder_with_backoff(
507
- api,
508
- folder_path=temp_dir,
509
- repo_id=PR_METADATA_REPO,
510
- repo_type="dataset",
511
- commit_message=f"Update PR metadata for {agent_identifier}"
512
- )
513
- print(f" ✓ Batch upload complete for {agent_identifier}")
514
 
515
- return True
 
 
516
 
517
- finally:
518
- # Always clean up temp directory
519
- if os.path.exists(temp_dir):
520
- shutil.rmtree(temp_dir)
 
 
 
 
 
 
521
 
522
  except Exception as e:
523
- print(f"Error saving PR metadata: {str(e)}")
524
  import traceback
525
  traceback.print_exc()
526
- return False
527
 
528
 
529
  def load_agents_from_hf():
@@ -537,13 +715,11 @@ def load_agents_from_hf():
537
  agents = []
538
 
539
  # List all files in the repository
540
- files = list_repo_files_with_backoff(api, repo_id=AGENTS_REPO, repo_type="dataset")
541
 
542
  # Filter for JSON files only
543
  json_files = [f for f in files if f.endswith('.json')]
544
 
545
- print(f"Found {len(json_files)} agent files in {AGENTS_REPO}")
546
-
547
  # Download and parse each JSON file
548
  for json_file in json_files:
549
  try:
@@ -567,10 +743,11 @@ def load_agents_from_hf():
567
  agents.append(agent_data)
568
 
569
  except Exception as e:
570
- print(f"Warning: Could not load {json_file}: {str(e)}")
571
  continue
572
 
573
- print(f" Loaded {len(agents)} agents from HuggingFace")
 
574
  return agents
575
 
576
  except Exception as e:
@@ -609,46 +786,54 @@ def calculate_pr_stats_from_metadata(metadata_list):
609
  }
610
 
611
 
612
- def calculate_monthly_metrics(all_metadata, agents):
613
  """
614
  Calculate monthly metrics for all agents for visualization.
615
 
616
  Args:
617
- all_metadata: List of all PR metadata with agent_identifier field
618
- agents: List of agent data dictionaries
619
 
620
  Returns:
621
- dict with monthly metrics organized by agent
 
 
 
 
 
 
 
 
 
 
622
  """
623
- from datetime import datetime, timezone
624
-
625
  # Create mapping from agent_identifier to agent_name
626
- identifier_to_name = {
627
- agent.get('github_identifier'): agent.get('name', 'Unknown')
628
- for agent in agents
629
- if agent.get('github_identifier')
630
- }
631
 
632
  # Group by agent and month
633
  agent_month_data = defaultdict(lambda: defaultdict(list))
634
 
635
- for pr_meta in all_metadata:
636
- agent_identifier = pr_meta.get('agent_identifier')
637
- created_at = pr_meta.get('created_at')
 
638
 
639
- if not agent_identifier or not created_at:
640
- continue
641
 
642
- # Get agent_name from identifier
643
- agent_name = identifier_to_name.get(agent_identifier, agent_identifier)
644
 
645
- try:
646
- dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
647
- month_key = f"{dt.year}-{dt.month:02d}"
648
- agent_month_data[agent_name][month_key].append(pr_meta)
649
- except Exception as e:
650
- print(f"Warning: Could not parse date '{created_at}': {e}")
651
- continue
652
 
653
  # Get all unique months and sort them
654
  all_months = set()
@@ -660,8 +845,8 @@ def calculate_monthly_metrics(all_metadata, agents):
660
  result_data = {}
661
  for agent_name, month_dict in agent_month_data.items():
662
  acceptance_rates = []
663
- total_prs = []
664
- merged_prs = []
665
  closed_not_merged_list = []
666
 
667
  for month in months:
@@ -682,14 +867,14 @@ def calculate_monthly_metrics(all_metadata, agents):
682
  acceptance_rate = (merged_count / total_decisions * 100) if total_decisions > 0 else None
683
 
684
  acceptance_rates.append(acceptance_rate)
685
- total_prs.append(total_count)
686
- merged_prs.append(merged_count)
687
  closed_not_merged_list.append(closed_not_merged_count)
688
 
689
  result_data[agent_name] = {
690
  'acceptance_rates': acceptance_rates,
691
- 'total_prs': total_prs,
692
- 'merged_prs': merged_prs,
693
  'closed_not_merged': closed_not_merged_list
694
  }
695
 
@@ -702,113 +887,36 @@ def calculate_monthly_metrics(all_metadata, agents):
702
  }
703
 
704
 
705
- def load_all_pr_metadata_from_hf(agents):
706
  """
707
- Load all PR metadata from HuggingFace dataset for all agents.
708
 
709
  Args:
710
- agents: List of agent dictionaries with github_identifier
 
711
 
712
  Returns:
713
- List of PR metadata with agent_identifier field added
714
- """
715
- try:
716
- api = HfApi()
717
- token = get_hf_token()
718
-
719
- # Calculate cutoff date
720
- cutoff_date = datetime.now(timezone.utc) - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
721
-
722
- # List all files in the repository
723
- files = list_repo_files_with_backoff(api, repo_id=PR_METADATA_REPO, repo_type="dataset")
724
-
725
- # Filter for files within the time frame
726
- relevant_files = []
727
- for f in files:
728
- if f.endswith('.jsonl'):
729
- parts = f.split('/')
730
- if len(parts) == 2:
731
- filename = parts[1]
732
- try:
733
- date_part = filename.replace('.jsonl', '')
734
- date_components = date_part.split('.')
735
- if len(date_components) == 3:
736
- file_year, file_month, file_day = map(int, date_components)
737
- file_date = datetime(file_year, file_month, file_day, tzinfo=timezone.utc)
738
-
739
- if file_date >= cutoff_date:
740
- relevant_files.append(f)
741
- except Exception:
742
- continue
743
-
744
- print(f"\n📥 Loading PR metadata from {len(relevant_files)} daily files...")
745
-
746
- all_metadata = []
747
- for filename in relevant_files:
748
- try:
749
- parts = filename.split('/')
750
- if len(parts) != 2:
751
- continue
752
-
753
- agent_identifier = parts[0]
754
-
755
- file_path = hf_hub_download_with_backoff(
756
- repo_id=PR_METADATA_REPO,
757
- filename=filename,
758
- repo_type="dataset",
759
- token=token
760
- )
761
- day_metadata = load_jsonl(file_path)
762
-
763
- # Add agent_identifier to each PR
764
- for pr_meta in day_metadata:
765
- created_at = pr_meta.get('created_at')
766
- if created_at:
767
- try:
768
- dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
769
- if dt >= cutoff_date:
770
- pr_meta['agent_identifier'] = agent_identifier
771
- all_metadata.append(pr_meta)
772
- except Exception:
773
- continue
774
-
775
- except Exception as e:
776
- print(f" Warning: Could not load {filename}: {str(e)}")
777
-
778
- print(f"✓ Loaded {len(all_metadata)} total PRs")
779
- return all_metadata
780
-
781
- except Exception as e:
782
- print(f"✗ Error loading PR metadata: {str(e)}")
783
- return []
784
-
785
-
786
- def construct_leaderboard_from_metadata(all_metadata, agents):
787
  """
788
- Construct leaderboard data from PR metadata.
789
-
790
- Args:
791
- all_metadata: List of PR metadata with agent_identifier field
792
- agents: List of agent dictionaries
793
 
794
- Returns:
795
- Dictionary mapping agent identifier to stats
796
- """
797
  cache_dict = {}
798
 
799
  for agent in agents:
800
  identifier = agent.get('github_identifier')
801
  agent_name = agent.get('name', 'Unknown')
802
 
803
- # Filter metadata for this agent
804
- bot_metadata = [pr for pr in all_metadata if pr.get('agent_identifier') == identifier]
805
 
806
  # Calculate stats
807
  stats = calculate_pr_stats_from_metadata(bot_metadata)
808
 
809
  cache_dict[identifier] = {
810
  'name': agent_name,
811
- 'website': agent.get('website', 'Unknown'),
812
  'github_identifier': identifier,
813
  **stats
814
  }
@@ -816,16 +924,16 @@ def construct_leaderboard_from_metadata(all_metadata, agents):
816
  return cache_dict
817
 
818
 
819
- def save_leaderboard_data_to_hf(leaderboard_data, monthly_metrics):
820
  """
821
- Save computed leaderboard and monthly metrics to HuggingFace dataset as swe-pr.json.
822
 
823
  Args:
824
- leaderboard_data: Dictionary with agent stats (from construct_leaderboard)
825
- monthly_metrics: Dictionary with monthly metrics (from calculate_monthly_metrics)
826
 
827
  Returns:
828
- True if successful, False otherwise
829
  """
830
  try:
831
  token = get_hf_token()
@@ -833,39 +941,39 @@ def save_leaderboard_data_to_hf(leaderboard_data, monthly_metrics):
833
  raise Exception("No HuggingFace token found")
834
 
835
  api = HfApi(token=token)
 
836
 
837
- # Combine data into single JSON structure
838
  combined_data = {
839
- 'leaderboard': leaderboard_data,
 
840
  'monthly_metrics': monthly_metrics,
841
- 'last_updated': datetime.now(timezone.utc).isoformat()
 
 
842
  }
843
 
844
- # Save to temp file
845
- temp_file = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json')
846
- try:
847
- json.dump(combined_data, temp_file, indent=2)
848
- temp_file.close()
849
 
850
- # Upload to HuggingFace
851
- print(f"\n📤 Uploading leaderboard data to {LEADERBOARD_REPO}/swe-pr.json...")
852
  upload_file_with_backoff(
853
- api,
854
- path_or_fileobj=temp_file.name,
855
- path_in_repo="swe-pr.json",
856
  repo_id=LEADERBOARD_REPO,
857
  repo_type="dataset"
858
  )
859
- print(f"✓ Leaderboard data uploaded successfully")
860
  return True
861
-
862
  finally:
863
- # Clean up temp file
864
- if os.path.exists(temp_file.name):
865
- os.unlink(temp_file.name)
866
 
867
  except Exception as e:
868
- print(f"Error saving leaderboard data: {str(e)}")
869
  import traceback
870
  traceback.print_exc()
871
  return False
@@ -878,31 +986,35 @@ def save_leaderboard_data_to_hf(leaderboard_data, monthly_metrics):
878
  def mine_all_agents():
879
  """
880
  Mine PR metadata for all agents within LEADERBOARD_TIME_FRAME_DAYS and save to HuggingFace.
881
- Uses ONE BigQuery query for ALL agents (most efficient approach).
882
  """
883
- # Load agent metadata from HuggingFace
 
 
 
 
 
 
 
 
884
  agents = load_agents_from_hf()
885
  if not agents:
886
- print("No agents found in HuggingFace dataset")
887
  return
888
 
889
  # Extract all identifiers
890
  identifiers = [agent['github_identifier'] for agent in agents if agent.get('github_identifier')]
891
  if not identifiers:
892
- print("No valid agent identifiers found")
893
  return
894
 
895
- print(f"\n{'='*80}")
896
- print(f"Starting PR metadata mining for {len(identifiers)} agents")
897
- print(f"Time frame: Last {LEADERBOARD_TIME_FRAME_DAYS} days")
898
- print(f"Data source: BigQuery + GitHub Archive (BATCHED QUERIES)")
899
- print(f"{'='*80}\n")
900
 
901
- # Initialize BigQuery client
902
  try:
903
- client = get_bigquery_client()
904
  except Exception as e:
905
- print(f"Failed to initialize BigQuery client: {str(e)}")
906
  return
907
 
908
  # Define time range: past LEADERBOARD_TIME_FRAME_DAYS (excluding today)
@@ -911,68 +1023,116 @@ def mine_all_agents():
911
  start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
912
 
913
  try:
914
- # Use batched approach for better performance
915
- # upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
916
- all_metadata = fetch_issue_metadata_batched(
917
- client, identifiers, start_date, end_date, batch_size=100, upload_immediately=True
918
  )
919
 
920
  # Calculate summary statistics
921
  total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
922
  agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
923
 
924
- print(f"\n{'='*80}")
925
- print(f"✅ BigQuery mining and upload complete!")
926
- print(f" Total agents: {len(agents)}")
927
- print(f" Agents with data: {agents_with_data}")
928
- print(f" Total PRs found: {total_prs}")
929
- print(f"{'='*80}\n")
930
 
931
  except Exception as e:
932
- print(f"Error during BigQuery fetch: {str(e)}")
933
  import traceback
934
  traceback.print_exc()
935
  return
 
 
 
 
 
 
 
 
936
 
937
- # Compute and save leaderboard data
938
- print(f"\n{'='*80}")
939
- print(f"📊 Computing leaderboard and monthly metrics...")
940
- print(f"{'='*80}\n")
941
 
942
  try:
943
- # Load all PR metadata from HuggingFace
944
- all_pr_metadata = load_all_pr_metadata_from_hf(agents)
945
-
946
- if all_pr_metadata:
947
- # Construct leaderboard
948
- leaderboard_data = construct_leaderboard_from_metadata(all_pr_metadata, agents)
949
- print(f"✓ Computed leaderboard for {len(leaderboard_data)} agents")
950
-
951
- # Calculate monthly metrics
952
- monthly_metrics = calculate_monthly_metrics(all_pr_metadata, agents)
953
- print(f"✓ Computed monthly metrics for {len(monthly_metrics['agents'])} agents across {len(monthly_metrics['months'])} months")
954
-
955
- # Save to HuggingFace
956
- if save_leaderboard_data_to_hf(leaderboard_data, monthly_metrics):
957
- print(f"\n{'='*80}")
958
- print(f"✅ Leaderboard data saved successfully!")
959
- print(f"{'='*80}\n")
960
- else:
961
- print(f"\n{'='*80}")
962
- print(f"⚠️ Warning: Failed to save leaderboard data")
963
- print(f"{'='*80}\n")
964
- else:
965
- print(f"⚠️ No PR metadata found to compute leaderboard")
966
 
967
  except Exception as e:
968
- print(f"Error computing/saving leaderboard data: {str(e)}")
969
  import traceback
970
  traceback.print_exc()
971
 
972
 
 
 
 
 
 
 
973
  # =============================================================================
974
  # ENTRY POINT
975
  # =============================================================================
976
 
977
  if __name__ == "__main__":
978
- mine_all_agents()
 
 
 
 
 
 
1
  """
2
  Minimalist PR Metadata Mining Script
3
+ Mines PR metadata from locally downloaded GHArchive data via DuckDB and saves to HuggingFace dataset.
4
  """
5
 
6
  import json
7
  import os
8
+ import time
9
  import tempfile
10
  from datetime import datetime, timezone, timedelta
11
  from collections import defaultdict
12
+ from concurrent.futures import ThreadPoolExecutor, as_completed
13
  from huggingface_hub import HfApi, hf_hub_download
14
  from huggingface_hub.errors import HfHubHTTPError
15
  from dotenv import load_dotenv
16
+ import duckdb
17
  import backoff
18
+ import requests
19
+ import requests.exceptions
20
+ from apscheduler.schedulers.blocking import BlockingScheduler
21
+ from apscheduler.triggers.cron import CronTrigger
22
+ import logging
23
 
24
  # Load environment variables
25
  load_dotenv()
 
30
 
31
  AGENTS_REPO = "SWE-Arena/bot_metadata"
32
  PR_METADATA_REPO = "SWE-Arena/pr_metadata"
33
+ LEADERBOARD_REPO = "SWE-Arena/leaderboard_metadata" # HuggingFace dataset for leaderboard data
34
+ LEADERBOARD_TIME_FRAME_DAYS = 180 # Time frame for leaderboard
35
+ GHARCHIVE_DATA_DIR = "../gharchive/data" # Local GHArchive data directory
36
+ DUCKDB_CACHE_FILE = "../gharchive/gharchive_cache.duckdb" # Persistent DuckDB database for caching
37
+
38
+ # Download configuration
39
+ DOWNLOAD_WORKERS = 48 # Number of parallel download threads
40
+ DOWNLOAD_RETRY_DELAY = 2 # Initial retry delay in seconds
41
+ MAX_RETRIES = 5 # Maximum number of retries for each API call
42
+
43
+ # Upload configuration
44
+ UPLOAD_DELAY_SECONDS = 5 # Delay between individual file uploads to avoid rate limits
45
+ UPLOAD_INITIAL_BACKOFF = 60 # Initial backoff time in seconds (1 minute)
46
+ UPLOAD_MAX_BACKOFF = 3600 # Maximum backoff time in seconds (60 minutes)
47
+
48
+ # Scheduler configuration
49
+ SCHEDULE_ENABLED = False # Enable/disable scheduler
50
+ SCHEDULE_DAY_OF_MONTH = 8 # Day of month (1-31) - the 8th falls in the second week
51
+ SCHEDULE_HOUR = 0 # Hour (0-23) - 12am midnight
52
+ SCHEDULE_MINUTE = 0 # Minute (0-59)
53
+ SCHEDULE_TIMEZONE = 'UTC' # Timezone for scheduling
54
 
55
  # =============================================================================
56
  # UTILITY FUNCTIONS
 
80
  f.write(json.dumps(item) + '\n')
81
 
82
 
83
+ def normalize_date_format(date_string):
84
  """
85
+ Convert date strings to standardized ISO 8601 format with Z suffix.
86
+ Handles both 'T' and space-separated datetime formats (including newlines).
87
+ Examples:
88
+ - 2025-10-15T23:23:47.983068 -> 2025-10-15T23:23:47Z
89
+ - 2025-06-17 21:21:07+00 -> 2025-06-17T21:21:07Z
90
  """
91
+ if not date_string or date_string == 'N/A':
92
+ return 'N/A'
93
 
94
+ try:
95
+ import re
96
+ # Remove all whitespace (spaces, newlines, tabs) and replace with single space
97
+ date_string = re.sub(r'\s+', ' ', date_string.strip())
 
98
 
99
+ # Replace space with 'T' for ISO format compatibility
100
+ date_string = date_string.replace(' ', 'T')
101
 
102
+ # Fix incomplete timezone offset (+00 or -00 -> +00:00 or -00:00)
103
+ # Check if timezone offset exists and is incomplete
104
+ if len(date_string) >= 3:
105
+ if date_string[-3:-2] in ('+', '-') and ':' not in date_string[-3:]:
106
+ date_string = date_string + ':00'
107
 
108
+ # Parse the date string (handles both with and without microseconds)
109
+ dt = datetime.fromisoformat(date_string.replace('Z', '+00:00'))
110
+
111
+ # Convert to standardized format
112
+ return dt.strftime('%Y-%m-%dT%H:%M:%SZ')
113
+ except Exception as e:
114
+ print(f"Warning: Could not parse date '{date_string}': {e}")
115
+ return date_string
116
+
117
+
118
+ def get_hf_token():
119
+ """Get HuggingFace token from environment variables."""
120
+ token = os.getenv('HF_TOKEN')
121
+ if not token:
122
+ print("Warning: HF_TOKEN not found in environment variables")
123
+ return token
124
 
 
 
 
125
 
126
+ # =============================================================================
127
+ # GHARCHIVE DOWNLOAD FUNCTIONS
128
+ # =============================================================================
129
 
130
+ def download_file(url):
131
  """
132
+ Download a GHArchive file with retry logic.
133
 
134
  Args:
135
+ url: URL to download
 
136
 
137
  Returns:
138
+ bool: True if successful, False otherwise
139
  """
140
+ filename = url.split("/")[-1]
141
+ filepath = os.path.join(GHARCHIVE_DATA_DIR, filename)
142
 
143
+ # Skip if json.gz already exists
144
+ if os.path.exists(filepath):
145
+ return True
146
 
147
+ # Download with retry logic
148
+ for attempt in range(MAX_RETRIES):
149
+ try:
150
+ response = requests.get(url, timeout=30)
151
+ response.raise_for_status()
152
+ with open(filepath, "wb") as f:
153
+ f.write(response.content)
154
+ return True
155
 
156
+ except requests.exceptions.HTTPError as e:
157
+ if e.response.status_code == 404:
158
+ # File doesn't exist, don't retry
159
+ return False
160
+ else:
161
+ # Other HTTP errors, retry
162
+ if attempt < MAX_RETRIES - 1:
163
+ wait_time = DOWNLOAD_RETRY_DELAY * (2 ** attempt) # Exponential backoff
164
+ print(f" ⚠ {filename}: HTTP error {e.response.status_code}, retrying in {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES})")
165
+ time.sleep(wait_time)
166
+ else:
167
+ print(f" ✗ {filename}: Failed after {MAX_RETRIES} attempts - {e}")
168
+
169
+ except (requests.exceptions.Timeout,
170
+ requests.exceptions.ConnectionError,
171
+ requests.exceptions.ReadTimeout) as e:
172
+ # Timeout/connection errors, retry
173
+ if attempt < MAX_RETRIES - 1:
174
+ wait_time = DOWNLOAD_RETRY_DELAY * (2 ** attempt) # Exponential backoff
175
+ print(f" ⚠ {filename}: {type(e).__name__}, retrying in {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES})")
176
+ time.sleep(wait_time)
177
+ else:
178
+ print(f" ✗ {filename}: Failed after {MAX_RETRIES} attempts - {type(e).__name__}")
179
+
180
+ except Exception as e:
181
+ # Other errors, retry
182
+ if attempt < MAX_RETRIES - 1:
183
+ wait_time = DOWNLOAD_RETRY_DELAY * (2 ** attempt)
184
+ print(f" ⚠ {filename}: {e}, retrying in {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES})")
185
+ time.sleep(wait_time)
186
+ else:
187
+ print(f" ✗ {filename}: Failed after {MAX_RETRIES} attempts - {e}")
188
 
189
+ return False
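A minimal smoke test for this helper, assuming the constants above are loaded (the archive date below is illustrative, not taken from this commit):

    os.makedirs(GHARCHIVE_DATA_DIR, exist_ok=True)
    url = "https://data.gharchive.org/2025-01-15-0.json.gz"  # illustrative hour
    if download_file(url):
        path = os.path.join(GHARCHIVE_DATA_DIR, url.split("/")[-1])
        print(f"Fetched {path} ({os.path.getsize(path)} bytes)")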
 
 
190
 
191
 
192
+ def download_all_gharchive_data():
193
+ """
194
+ Download all GHArchive data files for the last LEADERBOARD_TIME_FRAME_DAYS.
195
+ Uses parallel downloads with ThreadPoolExecutor.
196
+
197
+ Returns:
198
+ bool: True if the download pass finished (individual files may still have failed), False on a critical error
199
+ """
200
+ # Create data directory if it doesn't exist
201
+ os.makedirs(GHARCHIVE_DATA_DIR, exist_ok=True)
202
+
203
+ # Generate URLs for last N days (hourly files: 0-23 for each day)
204
+ end_date = datetime.now()
205
+ start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
206
+
207
+ urls = []
208
+ current_date = start_date
209
+ while current_date <= end_date:
210
+ date_str = current_date.strftime("%Y-%m-%d")
211
+ # Generate hourly URLs for this day (0-23)
212
+ for hour in range(24):
213
+ url = f"https://data.gharchive.org/{date_str}-{hour}.json.gz"
214
+ urls.append(url)
215
+ current_date += timedelta(days=1)
216
+
217
+ downloads_processed = 0
218
+
219
+ try:
220
+ with ThreadPoolExecutor(max_workers=DOWNLOAD_WORKERS) as executor:
221
+ # Submit all downloads
222
+ futures = [executor.submit(download_file, url) for url in urls]
223
+
224
+ # Wait for downloads to complete
225
+ for future in as_completed(futures):
226
+ downloads_processed += 1
227
+
228
+ print(f"Download complete: {downloads_processed} files")
229
+ return True
230
+
231
+ except Exception as e:
232
+ print(f"Error during download: {str(e)}")
233
+ import traceback
234
+ traceback.print_exc()
235
+ return False
236
 
237
 
238
  # =============================================================================
239
+ # HUGGINGFACE API WRAPPERS WITH ENHANCED BACKOFF
240
  # =============================================================================
241
 
242
+ def is_retryable_error(e):
243
+ """
244
+ Check if exception is retryable (rate limit or timeout error).
245
+ """
246
+ # Check for rate limit error (429)
247
  if isinstance(e, HfHubHTTPError):
248
+ if e.response.status_code == 429:
249
+ return True
250
 
251
+ # Check for timeout errors
252
+ if isinstance(e, (requests.exceptions.Timeout,
253
+ requests.exceptions.ReadTimeout,
254
+ requests.exceptions.ConnectTimeout)):
255
+ return True
256
 
257
+ # Check if it's a timeout error wrapped in HfHubHTTPError
258
+ if isinstance(e, Exception):
259
+ error_str = str(e).lower()
260
+ if 'timeout' in error_str or 'timed out' in error_str:
261
+ return True
262
+
263
+ return False
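The same retry policy can be reused for any other Hub call; a sketch with a hypothetical wrapper (delete_file_with_backoff is not part of this module):

    @backoff.on_exception(
        backoff.expo,
        (HfHubHTTPError, requests.exceptions.Timeout, requests.exceptions.RequestException, Exception),
        max_tries=8,
        base=300,
        max_value=3600,
        giveup=lambda e: not is_retryable_error(e),
    )
    def delete_file_with_backoff(api, **kwargs):
        """Hypothetical wrapper: retries api.delete_file() on rate limits and timeouts."""
        return api.delete_file(**kwargs)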
264
 
265
 
266
  @backoff.on_exception(
267
  backoff.expo,
268
+ (HfHubHTTPError, requests.exceptions.Timeout, requests.exceptions.RequestException, Exception),
 
269
  max_tries=8,
270
+ base=300,
271
+ max_value=3600,
272
+ giveup=lambda e: not is_retryable_error(e),
273
+ on_backoff=lambda details: print(
274
+ f" {details['exception']} error. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/8..."
275
+ )
276
  )
277
  def list_repo_files_with_backoff(api, **kwargs):
278
+ """Wrapper for api.list_repo_files() with exponential backoff for retryable errors."""
279
  return api.list_repo_files(**kwargs)
280
 
281
 
282
  @backoff.on_exception(
283
  backoff.expo,
284
+ (HfHubHTTPError, requests.exceptions.Timeout, requests.exceptions.RequestException, Exception),
 
285
  max_tries=8,
286
+ base=300,
287
+ max_value=3600,
288
+ giveup=lambda e: not is_retryable_error(e),
289
+ on_backoff=lambda details: print(
290
+ f" {details['exception']} error. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/8..."
291
+ )
292
  )
293
  def hf_hub_download_with_backoff(**kwargs):
294
+ """Wrapper for hf_hub_download() with exponential backoff for retryable errors."""
295
  return hf_hub_download(**kwargs)
296
 
297
 
298
  @backoff.on_exception(
299
  backoff.expo,
300
+ (HfHubHTTPError, requests.exceptions.Timeout, requests.exceptions.RequestException, Exception),
 
301
  max_tries=8,
302
+ base=300,
303
+ max_value=3600,
304
+ giveup=lambda e: not is_retryable_error(e),
305
+ on_backoff=lambda details: print(
306
+ f" {details['exception']} error. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/8..."
307
+ )
308
  )
309
+ def upload_file_with_backoff(api, **kwargs):
310
+ """Wrapper for api.upload_file() with exponential backoff for retryable errors."""
311
+ return api.upload_file(**kwargs)
312
 
313
 
314
  @backoff.on_exception(
315
  backoff.expo,
316
+ (HfHubHTTPError, requests.exceptions.Timeout, requests.exceptions.RequestException, Exception),
 
317
  max_tries=8,
318
+ base=300,
319
+ max_value=3600,
320
+ giveup=lambda e: not is_retryable_error(e),
321
+ on_backoff=lambda details: print(
322
+ f" {details['exception']} error. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/8..."
323
+ )
324
  )
325
+ def upload_folder_with_backoff(api, **kwargs):
326
+ """Wrapper for api.upload_folder() with exponential backoff for retryable errors."""
327
+ return api.upload_folder(**kwargs)
 
328
 
 
 
 
329
 
330
+ def get_duckdb_connection():
331
  """
332
+ Initialize DuckDB connection with persistent database and optimized parallelization.
 
 
 
 
 
 
 
 
 
333
 
334
  Returns:
335
+ DuckDB connection object
336
  """
337
+ # Use persistent database for caching results
338
+ conn = duckdb.connect(DUCKDB_CACHE_FILE)
 
339
 
340
+ # Tune DuckDB for parallel scans and caching
341
+ conn.execute("SET threads TO 8;") # Use all available cores
342
+ conn.execute("SET preserve_insertion_order = false;") # Better parallelization
343
+ conn.execute("SET enable_object_cache = true;") # Cache objects for reuse
344
+ conn.execute("SET temp_directory = '/tmp/duckdb_temp';") # Use fast temp storage if needed
 
 
345
 
346
+ return conn
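A quick sanity check, assuming at least one hourly file is already on disk (the filename below is illustrative), confirms DuckDB can scan the gzipped NDJSON directly:

    con = get_duckdb_connection()
    sample = os.path.join(GHARCHIVE_DATA_DIR, "2025-01-15-0.json.gz")  # illustrative file
    count = con.execute(f"""
        SELECT count(*)
        FROM read_json('{sample}', format='newline_delimited', compression='gzip', ignore_errors=true)
        WHERE TRY_CAST(type AS VARCHAR) = 'PullRequestEvent'
    """).fetchone()[0]
    print(f"PullRequestEvent rows in that hour: {count}")
    con.close()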
 
347
 
 
 
348
 
349
+ def generate_file_path_patterns(start_date, end_date, data_dir=GHARCHIVE_DATA_DIR):
350
+ """
351
+ Generate file path patterns for GHArchive data in date range.
352
+ Only includes files that actually exist on disk.
 
353
 
354
+ Args:
355
+ start_date: Start datetime
356
+ end_date: End datetime
357
+ data_dir: Directory containing GHArchive data files
 
 
358
 
359
+ Returns:
360
+ List of file path patterns (hourly JSON.gz files) that exist
361
+ """
362
+ file_patterns = []
363
+ missing_dates = set()
364
 
365
+ current_date = start_date.replace(hour=0, minute=0, second=0, microsecond=0)
366
+ end_day = end_date.replace(hour=0, minute=0, second=0, microsecond=0)
 
 
 
367
 
368
+ while current_date <= end_day:
369
+ # Pattern for hourly JSON.gz files: 2024-11-15-{0..23}.json.gz
370
+ date_has_files = False
371
+ for hour in range(24):
372
+ pattern = os.path.join(data_dir, f"{current_date.strftime('%Y-%m-%d')}-{hour}.json.gz")
373
+ # Only add pattern if file exists
374
+ if os.path.exists(pattern):
375
+ file_patterns.append(pattern)
376
+ date_has_files = True
377
 
378
+ # Track missing dates
379
+ if not date_has_files:
380
+ missing_dates.add(current_date.strftime('%Y-%m-%d'))
381
 
382
+ # Move to next day
383
+ current_date += timedelta(days=1)
 
 
384
 
385
+ # Print warning about missing dates
386
+ if missing_dates:
387
+ print(f" Warning: Skipping {len(missing_dates)} date(s) with no data files: {', '.join(sorted(missing_dates))}")
388
 
389
+ return file_patterns
390
 
391
 
392
+ # =============================================================================
393
+ # DUCKDB QUERY FUNCTIONS
394
+ # =============================================================================
395
 
396
+ def fetch_all_pr_metadata_single_query(conn, identifiers, start_date, end_date):
397
+ """
398
+ Fetch PR metadata for ALL agents using ONE comprehensive DuckDB query.
399
 
400
  This query fetches:
401
  1. PRs authored by agents (user.login matches identifier)
402
+ 2. PR status (opened, merged, closed)
403
 
404
  Args:
405
+ conn: DuckDB connection instance
406
  identifiers: List of GitHub usernames/bot identifiers
407
  start_date: Start datetime (timezone-aware)
408
  end_date: End datetime (timezone-aware)
 
412
  {
413
  'agent-identifier': [
414
  {
415
+ 'html_url': PR URL,
416
  'created_at': Creation timestamp,
417
  'merged_at': Merge timestamp (if merged, else None),
418
  'closed_at': Close timestamp (if closed but not merged, else None)
 
422
  ...
423
  }
424
  """
425
+ # Generate file path patterns for the time range
426
+ file_patterns = generate_file_path_patterns(start_date, end_date)
427
 
428
+ # Build identifier list for IN clause
429
+ identifier_list = ', '.join([f"'{id}'" for id in identifiers])
430
 
431
+ # Build comprehensive query with CTEs using parameterized file lists (JSON.gz format)
 
 
 
432
  query = f"""
433
  WITH pr_events AS (
434
  -- Get all PR events (opened, closed) for all agents
435
  SELECT
436
+ TRY_CAST(json_extract_string(payload, '$.pull_request.html_url') AS VARCHAR) as url,
437
+ TRY_CAST(json_extract_string(payload, '$.pull_request.user.login') AS VARCHAR) as pr_author,
438
+ TRY_CAST(json_extract_string(payload, '$.pull_request.created_at') AS VARCHAR) as created_at,
439
+ TRY_CAST(json_extract_string(payload, '$.pull_request.merged') AS BOOLEAN) as is_merged,
440
+ TRY_CAST(json_extract_string(payload, '$.pull_request.merged_at') AS VARCHAR) as merged_at,
441
+ TRY_CAST(json_extract_string(payload, '$.pull_request.closed_at') AS VARCHAR) as closed_at,
442
+ TRY_CAST(json_extract_string(payload, '$.action') AS VARCHAR) as action,
443
  created_at as event_time
444
+ FROM read_json($file_patterns, union_by_name=true, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true, maximum_object_size=2147483648)
 
 
445
  WHERE
446
+ TRY_CAST(type AS VARCHAR) = 'PullRequestEvent'
447
+ AND json_extract_string(payload, '$.pull_request.html_url') IS NOT NULL
448
+ AND TRY_CAST(json_extract_string(payload, '$.pull_request.user.login') AS VARCHAR) IN ({identifier_list})
449
  ),
450
 
451
  pr_latest_state AS (
 
472
  ORDER BY created_at DESC
473
  """
474
 
 
 
 
475
  try:
476
+ # Create cache table name based on date range
477
+ cache_table_name = f"pr_cache_{start_date.strftime('%Y%m%d')}_{end_date.strftime('%Y%m%d')}"
478
+
479
+ # Check if cache exists and is valid
480
+ cache_exists = conn.execute(f"""
481
+ SELECT COUNT(*) FROM information_schema.tables
482
+ WHERE table_name = '{cache_table_name}'
483
+ """).fetchone()[0] > 0
484
+
485
+ if cache_exists:
486
+ results = conn.execute(f"""
487
+ SELECT url, pr_author, created_at, merged_at, closed_at
488
+ FROM {cache_table_name}
489
+ WHERE pr_author IN ({identifier_list})
490
+ """).fetchall()
491
+ else:
492
+ # Execute query with parameters
493
+ results = conn.execute(query, {'file_patterns': file_patterns}).fetchall()
494
+
495
+ # Cache the complete results for all future queries in this date range
496
+ if len(results) > 0:
497
+ conn.execute(f"""
498
+ CREATE TABLE {cache_table_name} AS
499
+ SELECT * FROM (
500
+ SELECT UNNEST($1) as url, UNNEST($2) as pr_author,
501
+ UNNEST($3) as created_at, UNNEST($4) as merged_at,
502
+ UNNEST($5) as closed_at
503
+ )
504
+ """, [
505
+ [r[0] for r in results],
506
+ [r[1] for r in results],
507
+ [r[2] for r in results],
508
+ [r[3] for r in results],
509
+ [r[4] for r in results]
510
+ ])
511
 
512
  # Group results by agent
513
  metadata_by_agent = defaultdict(list)
514
 
515
  for row in results:
516
+ url = row[0]
517
+ pr_author = row[1]
518
+ created_at = normalize_date_format(row[2]) if row[2] else None
519
+ merged_at = normalize_date_format(row[3]) if row[3] else None
520
+ closed_at = normalize_date_format(row[4]) if row[4] else None
521
+
522
+ metadata_by_agent[pr_author].append({
523
+ 'html_url': url,
 
 
 
 
 
 
 
524
  'created_at': created_at,
525
  'merged_at': merged_at,
526
  'closed_at': closed_at,
527
+ })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
528
 
529
  # Convert defaultdict to regular dict
530
  return dict(metadata_by_agent)
531
 
532
  except Exception as e:
533
+ print(f"DuckDB error: {str(e)}")
534
  import traceback
535
  traceback.print_exc()
536
  return {}
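The returned mapping can be summarized per agent before uploading; a sketch reusing the conn/start_date/end_date set up in mine_all_agents(), with a hypothetical bot login:

    all_metadata = fetch_all_pr_metadata_single_query(conn, ["example-bot[bot]"], start_date, end_date)
    for agent, prs in all_metadata.items():
        merged = sum(1 for p in prs if p.get("merged_at"))
        closed = sum(1 for p in prs if p.get("closed_at") and not p.get("merged_at"))
        print(f"{agent}: {len(prs)} PRs, {merged} merged, {closed} closed without merge")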
537
 
538
 
539
  # =============================================================================
540
+ # HUGGINGFACE STORAGE FUNCTIONS WITH BATCH UPLOAD
541
  # =============================================================================
542
 
543
  def group_metadata_by_date(metadata_list):
544
  """
545
+ Group PR metadata by date (year.month.day) for daily storage.
546
  Returns dict: {(year, month, day): [metadata_list]}
547
  """
548
  grouped = defaultdict(list)
 
562
  return dict(grouped)
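For reference, a sketch of how these daily buckets map onto the file names used by the uploader below, assuming the bucket key is derived from created_at (the sample PR is illustrative):

    sample = [{"html_url": "https://github.com/org/repo/pull/1",
               "created_at": "2025-01-15T12:00:00Z", "merged_at": None, "closed_at": None}]
    for (year, month, day), items in group_metadata_by_date(sample).items():
        print(f"{year}.{month:02d}.{day:02d}.jsonl -> {len(items)} PR(s)")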
563
 
564
 
565
+ def upload_single_file_with_retry(api, local_path, repo_path, repo_id, repo_type, commit_message, max_retries=MAX_RETRIES):
566
  """
567
+ Upload a single file with exponential backoff retry logic.
 
 
 
 
568
 
569
  Args:
570
+ api: HfApi instance
571
+ local_path: Local file path
572
+ repo_path: Path in repository
573
+ repo_id: Repository ID
574
+ repo_type: Repository type (e.g., "dataset")
575
+ commit_message: Commit message
576
+ max_retries: Maximum number of retries
577
+
578
+ Returns:
579
+ bool: True if successful, False otherwise
580
  """
581
+ for attempt in range(max_retries):
582
+ try:
583
+ upload_file_with_backoff(
584
+ api=api,
585
+ path_or_fileobj=local_path,
586
+ path_in_repo=repo_path,
587
+ repo_id=repo_id,
588
+ repo_type=repo_type,
589
+ commit_message=commit_message
590
+ )
591
+ return True
592
+ except Exception as e:
593
+ if attempt < max_retries - 1:
594
+ # Calculate exponential backoff
595
+ wait_time = min(UPLOAD_INITIAL_BACKOFF * (2 ** attempt), UPLOAD_MAX_BACKOFF)
596
+ print(f" {e} error on attempt {attempt + 1}/{max_retries}. Retrying in {wait_time}s...")
597
+ time.sleep(wait_time)
598
+ else:
599
+ print(f" Failed after {max_retries} attempts: {str(e)}")
600
+ return False
601
+ return False
602
+
603
+
604
+ def batch_upload_pr_metadata(all_metadata):
605
+ """
606
+ Upload PR metadata for all agents with time gaps between uploads.
607
+ Each agent's data is uploaded as separate daily files with retry logic.
608
+
609
+ Args:
610
+ all_metadata: Dictionary mapping agent identifier to list of PR metadata
611
 
612
+ Returns:
613
+ tuple: (success_count, error_count)
614
+ """
615
  try:
616
  token = get_hf_token()
617
  if not token:
 
619
 
620
  api = HfApi(token=token)
621
 
622
+ success_count = 0
623
+ error_count = 0
624
+ total_files = 0
625
 
626
+ # First, calculate total number of files to upload
627
+ for agent_identifier, metadata_list in all_metadata.items():
628
+ if metadata_list:
629
+ grouped = group_metadata_by_date(metadata_list)
630
+ total_files += len(grouped)
631
 
632
+ print(f"Uploading {total_files} files for {len(all_metadata)} agents...")
 
 
 
633
 
634
+ file_count = 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
635
 
636
+ for agent_idx, (agent_identifier, metadata_list) in enumerate(all_metadata.items(), 1):
637
+ if not metadata_list:
638
+ continue
639
 
640
+ # Group by date
641
+ grouped = group_metadata_by_date(metadata_list)
642
+
643
+ # Create temporary files for this agent
644
+ agent_temp_dir = tempfile.mkdtemp()
645
+
646
+ try:
647
+ # Prepare all files locally
648
+ local_files = []
649
+ for (pr_year, month, day), day_metadata in grouped.items():
650
+ filename = f"{pr_year}.{month:02d}.{day:02d}.jsonl"
651
+ local_path = os.path.join(agent_temp_dir, filename)
652
+ repo_path = f"{agent_identifier}/{filename}"
653
+
654
+ # Sort by created_at for better organization
655
+ day_metadata.sort(key=lambda x: x.get('created_at', ''), reverse=True)
656
+
657
+ # Save to temp file
658
+ save_jsonl(local_path, day_metadata)
659
+ local_files.append((local_path, repo_path, len(day_metadata)))
660
+
661
+ # Upload each file with delay
662
+ agent_success = 0
663
+ agent_error = 0
664
+
665
+ for file_idx, (local_path, repo_path, pr_count) in enumerate(local_files, 1):
666
+ file_count += 1
667
+
668
+ if upload_single_file_with_retry(
669
+ api=api,
670
+ local_path=local_path,
671
+ repo_path=repo_path,
672
+ repo_id=PR_METADATA_REPO,
673
+ repo_type="dataset",
674
+ commit_message=f"Update {repo_path}",
675
+ max_retries=MAX_RETRIES
676
+ ):
677
+ agent_success += 1
678
+ success_count += 1
679
+ else:
680
+ agent_error += 1
681
+ error_count += 1
682
+
683
+ # Add delay between uploads (except for last file)
684
+ if file_idx < len(local_files):
685
+ time.sleep(UPLOAD_DELAY_SECONDS)
686
+
687
+ finally:
688
+ # Clean up temp directory
689
+ if os.path.exists(agent_temp_dir):
690
+ import shutil
691
+ shutil.rmtree(agent_temp_dir)
692
+
693
+ if error_count > 0:
694
+ print(f"Upload complete: {success_count}/{total_files} succeeded, {error_count} errors")
695
+ else:
696
+ print(f"Upload complete: {success_count}/{total_files} files")
697
+
698
+ return success_count, error_count
699
 
700
  except Exception as e:
701
+ print(f"Error during batch upload: {str(e)}")
702
  import traceback
703
  traceback.print_exc()
704
+ return 0, total_files if 'total_files' in locals() else 0
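Because consecutive uploads are spaced by UPLOAD_DELAY_SECONDS, a rough lower bound on the pacing overhead (ignoring the uploads themselves and any retries) is:

    n_files = 300  # illustrative total across all agents
    print(f"~{n_files * UPLOAD_DELAY_SECONDS / 60:.0f} min spent on inter-upload delays")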
705
 
706
 
707
  def load_agents_from_hf():
 
715
  agents = []
716
 
717
  # List all files in the repository
718
+ files = list_repo_files_with_backoff(api=api, repo_id=AGENTS_REPO, repo_type="dataset")
719
 
720
  # Filter for JSON files only
721
  json_files = [f for f in files if f.endswith('.json')]
722
 
 
 
723
  # Download and parse each JSON file
724
  for json_file in json_files:
725
  try:
 
743
  agents.append(agent_data)
744
 
745
  except Exception as e:
746
+ print(f"Error loading {json_file}: {str(e)}")
747
  continue
748
 
749
+ print(f"Download complete: {len(agents)} agents")
750
+
751
  return agents
752
 
753
  except Exception as e:
 
786
  }
787
 
788
 
789
+ def calculate_monthly_metrics_by_agent(all_metadata_dict, agents):
790
  """
791
  Calculate monthly metrics for all agents for visualization.
792
 
793
  Args:
794
+ all_metadata_dict: Dictionary mapping agent identifier to list of PR metadata
795
+ agents: List of agent dictionaries with metadata
796
 
797
  Returns:
798
+ dict: {
799
+ 'agents': list of agent names,
800
+ 'months': list of month labels (e.g., '2025-01'),
801
+ 'data': {
802
+ agent_name: {
803
+ 'acceptance_rates': list of acceptance rates by month,
804
+ 'total_prs': list of PR counts by month,
805
+ 'merged_prs': list of merged PR counts by month,
806
+ }
807
+ }
808
+ }
809
  """
 
 
810
  # Create mapping from agent_identifier to agent_name
811
+ identifier_to_name = {agent.get('github_identifier'): agent.get('name') for agent in agents if agent.get('github_identifier')}
812
+
813
+ if not all_metadata_dict:
814
+ return {'agents': [], 'months': [], 'data': {}}
 
815
 
816
  # Group by agent and month
817
  agent_month_data = defaultdict(lambda: defaultdict(list))
818
 
819
+ # Flatten the dict of lists into a single list with agent_identifier added
820
+ for agent_identifier, metadata_list in all_metadata_dict.items():
821
+ for pr_meta in metadata_list:
822
+ created_at = pr_meta.get('created_at')
823
 
824
+ if not created_at:
825
+ continue
826
 
827
+ # Get agent_name from identifier
828
+ agent_name = identifier_to_name.get(agent_identifier, agent_identifier)
829
 
830
+ try:
831
+ dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
832
+ month_key = f"{dt.year}-{dt.month:02d}"
833
+ agent_month_data[agent_name][month_key].append(pr_meta)
834
+ except Exception as e:
835
+ print(f"Warning: Could not parse date '{created_at}': {e}")
836
+ continue
837
 
838
  # Get all unique months and sort them
839
  all_months = set()
 
845
  result_data = {}
846
  for agent_name, month_dict in agent_month_data.items():
847
  acceptance_rates = []
848
+ total_prs_list = []
849
+ merged_prs_list = []
850
  closed_not_merged_list = []
851
 
852
  for month in months:
 
867
  acceptance_rate = (merged_count / total_decisions * 100) if total_decisions > 0 else None
868
 
869
  acceptance_rates.append(acceptance_rate)
870
+ total_prs_list.append(total_count)
871
+ merged_prs_list.append(merged_count)
872
  closed_not_merged_list.append(closed_not_merged_count)
873
 
874
  result_data[agent_name] = {
875
  'acceptance_rates': acceptance_rates,
876
+ 'total_prs': total_prs_list,
877
+ 'merged_prs': merged_prs_list,
878
  'closed_not_merged': closed_not_merged_list
879
  }
880
 
 
887
  }
888
 
889
 
890
+ def construct_leaderboard_from_metadata(all_metadata_dict, agents):
891
  """
892
+ Construct leaderboard from in-memory PR metadata.
893
 
894
  Args:
895
+ all_metadata_dict: Dictionary mapping agent identifier to list of PR metadata
896
+ agents: List of agent dictionaries with metadata
897
 
898
  Returns:
899
+ Dictionary of agent stats.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
900
  """
901
+ if not agents:
902
+ print("Error: No agents found")
903
+ return {}
 
 
904
 
 
 
 
905
  cache_dict = {}
906
 
907
  for agent in agents:
908
  identifier = agent.get('github_identifier')
909
  agent_name = agent.get('name', 'Unknown')
910
 
911
+ # Get metadata for this agent from the dictionary
912
+ bot_metadata = all_metadata_dict.get(identifier, [])
913
 
914
  # Calculate stats
915
  stats = calculate_pr_stats_from_metadata(bot_metadata)
916
 
917
  cache_dict[identifier] = {
918
  'name': agent_name,
919
+ 'website': agent.get('website', 'N/A'),
920
  'github_identifier': identifier,
921
  **stats
922
  }
 
924
  return cache_dict
925
 
926
 
927
+ def save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics):
928
  """
929
+ Save leaderboard data and monthly metrics to HuggingFace dataset as swe-pr.json.
930
 
931
  Args:
932
+ leaderboard_dict: Dictionary of agent stats from construct_leaderboard_from_metadata()
933
+ monthly_metrics: Monthly metrics data from calculate_monthly_metrics_by_agent()
934
 
935
  Returns:
936
+ bool: True if successful, False otherwise
937
  """
938
  try:
939
  token = get_hf_token()
 
941
  raise Exception("No HuggingFace token found")
942
 
943
  api = HfApi(token=token)
944
+ filename = "swe-pr.json"
945
 
946
+ # Combine leaderboard and monthly metrics
947
  combined_data = {
948
+ 'last_updated': datetime.now(timezone.utc).isoformat(),
949
+ 'leaderboard': leaderboard_dict,
950
  'monthly_metrics': monthly_metrics,
951
+ 'metadata': {
952
+ 'leaderboard_time_frame_days': LEADERBOARD_TIME_FRAME_DAYS
953
+ }
954
  }
955
 
956
+ # Save locally first
957
+ with open(filename, 'w') as f:
958
+ json.dump(combined_data, f, indent=2)
 
 
959
 
960
+ try:
961
+ # Upload to HuggingFace with retry logic
962
  upload_file_with_backoff(
963
+ api=api,
964
+ path_or_fileobj=filename,
965
+ path_in_repo=filename,
966
  repo_id=LEADERBOARD_REPO,
967
  repo_type="dataset"
968
  )
 
969
  return True
 
970
  finally:
971
+ # Always clean up local file
972
+ if os.path.exists(filename):
973
+ os.remove(filename)
974
 
975
  except Exception as e:
976
+ print(f"Error saving leaderboard data: {str(e)}")
977
  import traceback
978
  traceback.print_exc()
979
  return False
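Consumers such as the Gradio app can read the published file back through the same Hub client; a sketch assuming read access to the dataset:

    from huggingface_hub import hf_hub_download
    import json

    path = hf_hub_download(repo_id=LEADERBOARD_REPO, filename="swe-pr.json", repo_type="dataset")
    with open(path) as f:
        data = json.load(f)
    print(data["last_updated"], "-", len(data["leaderboard"]), "agents on the leaderboard")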
 
986
  def mine_all_agents():
987
  """
988
  Mine PR metadata for all agents within LEADERBOARD_TIME_FRAME_DAYS and save to HuggingFace.
989
+ Downloads GHArchive data first, then uses ONE DuckDB query for ALL agents, then batch uploads with time gaps.
990
  """
991
+ # Step 1: Download GHArchive data
992
+ print(f"\n[1/5] Downloading GHArchive data...")
993
+
994
+ if not download_all_gharchive_data():
995
+ print("Warning: Download had errors, continuing with available data...")
996
+
997
+ # Step 2: Load agent metadata from HuggingFace
998
+ print(f"\n[2/5] Loading agent metadata...")
999
+
1000
  agents = load_agents_from_hf()
1001
  if not agents:
1002
+ print("Error: No agents found")
1003
  return
1004
 
1005
  # Extract all identifiers
1006
  identifiers = [agent['github_identifier'] for agent in agents if agent.get('github_identifier')]
1007
  if not identifiers:
1008
+ print("Error: No valid agent identifiers found")
1009
  return
1010
 
1011
+ print(f"\n[3/5] Mining PR metadata ({len(identifiers)} agents, {LEADERBOARD_TIME_FRAME_DAYS} days)...")
 
 
 
 
1012
 
1013
+ # Initialize DuckDB connection
1014
  try:
1015
+ conn = get_duckdb_connection()
1016
  except Exception as e:
1017
+ print(f"Failed to initialize DuckDB connection: {str(e)}")
1018
  return
1019
 
1020
  # Define time range: past LEADERBOARD_TIME_FRAME_DAYS (excluding today)
 
1023
  start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
1024
 
1025
  try:
1026
+ # Use single query for all agents
1027
+ all_metadata = fetch_all_pr_metadata_single_query(
1028
+ conn, identifiers, start_date, end_date
 
1029
  )
1030
 
1031
  # Calculate summary statistics
1032
  total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
1033
  agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
1034
 
1035
+ print(f"Query complete: {total_prs} PRs found for {agents_with_data}/{len(agents)} agents")
 
 
 
 
 
1036
 
1037
  except Exception as e:
1038
+ print(f"Error during DuckDB fetch: {str(e)}")
1039
  import traceback
1040
  traceback.print_exc()
1041
  return
1042
+ finally:
1043
+ # Close DuckDB connection
1044
+ conn.close()
1045
+
1046
+ # Step 4: Batch upload PR metadata with time gaps
1047
+ print(f"\n[4/5] Uploading PR metadata...")
1048
+
1049
+ success_count, error_count = batch_upload_pr_metadata(all_metadata)
1050
 
1051
+ # Step 5: Construct and save leaderboard data
1052
+ print(f"\n[5/5] Saving leaderboard...")
 
 
1053
 
1054
  try:
1055
+ # Construct leaderboard from in-memory data
1056
+ leaderboard_dict = construct_leaderboard_from_metadata(all_metadata, agents)
1057
+
1058
+ # Calculate monthly metrics from in-memory data
1059
+ monthly_metrics = calculate_monthly_metrics_by_agent(all_metadata, agents)
1060
+
1061
+ # Save to HuggingFace
1062
+ save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics)
1063
+
1064
+ print(f"\nCOMPLETE: {success_count} files uploaded" + (f", {error_count} errors" if error_count > 0 else ""))
 
 
 
 
 
 
 
 
 
 
 
 
 
1065
 
1066
  except Exception as e:
1067
+ print(f"Error saving leaderboard: {str(e)}")
1068
  import traceback
1069
  traceback.print_exc()
1070
 
1071
 
1072
+ # =============================================================================
1073
+ # SCHEDULER SETUP
1074
+ # =============================================================================
1075
+
1076
+ def setup_scheduler():
1077
+ """
1078
+ Set up APScheduler to run mining jobs periodically.
1079
+ The schedule is configured via the module-level SCHEDULE_* constants.
1080
+
1081
+ Configuration constants:
1082
+ - SCHEDULE_ENABLED: Enable/disable the scheduler (default: False)
1083
+ - SCHEDULE_DAY_OF_MONTH: Day of month to run (default: 8, second week)
1084
+ - SCHEDULE_HOUR: Hour to run (0-23, default: 0)
1085
+ - SCHEDULE_MINUTE: Minute to run (0-59, default: 0)
1086
+ - SCHEDULE_TIMEZONE: Timezone for scheduling (default: UTC)
1087
+ """
1088
+ # Configure logging for APScheduler
1089
+ logging.basicConfig(
1090
+ level=logging.INFO,
1091
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
1092
+ )
1093
+
1094
+ # Disable verbose HTTP request logging from httpx (used by huggingface_hub)
1095
+ logging.getLogger('httpx').setLevel(logging.WARNING)
1096
+
1097
+ # Create scheduler
1098
+ scheduler = BlockingScheduler(timezone=SCHEDULE_TIMEZONE)
1099
+
1100
+ # Create cron trigger with configured schedule (monthly on specific day)
1101
+ trigger = CronTrigger(
1102
+ day=SCHEDULE_DAY_OF_MONTH,
1103
+ hour=SCHEDULE_HOUR,
1104
+ minute=SCHEDULE_MINUTE,
1105
+ timezone=SCHEDULE_TIMEZONE
1106
+ )
1107
+
1108
+ # Add job to scheduler
1109
+ scheduler.add_job(
1110
+ mine_all_agents,
1111
+ trigger=trigger,
1112
+ id='mine_all_agents',
1113
+ name='Mine GHArchive data for all agents',
1114
+ replace_existing=True
1115
+ )
1116
+
1117
+ # Print schedule information
1118
+ from datetime import datetime
1119
+ next_run = trigger.get_next_fire_time(None, datetime.now(trigger.timezone))
1120
+ print(f"Scheduler: Monthly on day {SCHEDULE_DAY_OF_MONTH} at {SCHEDULE_HOUR:02d}:{SCHEDULE_MINUTE:02d} {SCHEDULE_TIMEZONE}")
1121
+ print(f"Next run: {next_run}\n")
1122
+
1123
+ # Start scheduler (blocking call)
1124
+ print(f"\nScheduler started")
1125
+ scheduler.start()
1126
+
1127
+
1128
  # =============================================================================
1129
  # ENTRY POINT
1130
  # =============================================================================
1131
 
1132
  if __name__ == "__main__":
1133
+ if SCHEDULE_ENABLED:
1134
+ # Run with scheduler
1135
+ setup_scheduler()
1136
+ else:
1137
+ # Run without scheduler, just mine once
1138
+ mine_all_agents()
requirements.txt CHANGED
@@ -1,12 +1,10 @@
1
  APScheduler
2
  backoff
3
- datasets
4
- db-dtypes
5
- google-cloud-bigquery
6
  gradio
7
  gradio_leaderboard
8
  huggingface_hub
9
  pandas
10
  plotly
11
- PyGithub
12
- python-dotenv
 
1
  APScheduler
2
  backoff
3
+ duckdb[all]
 
 
4
  gradio
5
  gradio_leaderboard
6
  huggingface_hub
7
  pandas
8
  plotly
9
+ python-dotenv
10
+ requests