Refactor app.py to use BigQuery for PR mining

Major changes:
- Remove debug mode and argparse completely
- Add BigQuery client and mining functions
- Remove GitHub API complexity (TokenPool, rate limiting, time partitioning)
- Replace daily incremental updates with weekly BigQuery mining
- Add UPDATE_TIME_FRAME_DAYS configuration (30 days)
- Update save_pr_metadata_to_hf to use upload_large_folder with complete overwrite
- Add top_n filtering to monthly metrics (show top 5 agents)
- Replace hardcoded colors with HSL color generation
- Replace hardcoded month references with LEADERBOARD_TIME_FRAME_DAYS // 30
- Rename get_daily_files_last_n_months to get_daily_files_last_time_frame
- Clean up unused imports (remove threading, datasets)

This makes the app more efficient: PRs are now mined in weekly BigQuery batches instead of
rate-limited, incremental GitHub API updates. Hedged sketches of the new mining, upload,
filtering, and color logic are included below alongside the relevant parts of the diff.
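For illustration, here is a minimal sketch of what the weekly BigQuery mining step could look like. The function name, the use of the public `githubarchive` tables, and the query shape are assumptions for this note, not the exact code added to app.py:

```python
# Hypothetical sketch of the weekly BigQuery mining job (not the actual app.py code).
# Assumes PR events are read from the public GitHub Archive dataset in BigQuery.
from datetime import datetime, timedelta, timezone

from google.cloud import bigquery

UPDATE_TIME_FRAME_DAYS = 30  # mirrors the new configuration constant


def mine_prs_from_bigquery(agent_identifier):
    """Return lightweight PR metadata for one agent within the update window."""
    client = bigquery.Client()
    cutoff = datetime.now(timezone.utc) - timedelta(days=UPDATE_TIME_FRAME_DAYS)
    query = """
        SELECT
          JSON_VALUE(payload, '$.pull_request.html_url')   AS html_url,
          JSON_VALUE(payload, '$.pull_request.created_at') AS created_at,
          JSON_VALUE(payload, '$.pull_request.merged_at')  AS merged_at,
          JSON_VALUE(payload, '$.pull_request.closed_at')  AS closed_at
        FROM `githubarchive.day.2*`
        WHERE type = 'PullRequestEvent'
          AND actor.login = @agent
          AND created_at >= @cutoff
    """
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("agent", "STRING", agent_identifier),
            bigquery.ScalarQueryParameter("cutoff", "TIMESTAMP", cutoff),
        ]
    )
    rows = client.query(query, job_config=job_config).result()
    return [dict(row.items()) for row in rows]
```

A production query would also restrict `_TABLE_SUFFIX` to the mining window so BigQuery only scans the relevant daily tables.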
@@ -1,54 +1,33 @@
 import gradio as gr
-from gradio_leaderboard import Leaderboard
 import json
 import os
 import time
…
 import requests
 from datetime import datetime, timezone, timedelta
 from collections import defaultdict
 from huggingface_hub import HfApi, hf_hub_download
-from datasets import load_dataset, Dataset
-import threading
 from dotenv import load_dotenv
 import pandas as pd
 import random
-import argparse
 import plotly.graph_objects as go
 from plotly.subplots import make_subplots
 from apscheduler.schedulers.background import BackgroundScheduler
 from apscheduler.triggers.cron import CronTrigger
…
 
 # Load environment variables
 load_dotenv()
 
-# Parse command-line arguments
-parser = argparse.ArgumentParser(description='SWE Agent PR Leaderboard')
-parser.add_argument('--debug', '--DEBUG', action='store_true',
-                    help='Enable debug mode (limits PR retrieval to 10 per query pattern)')
-parser.add_argument('--no-debug', '--production', action='store_true',
-                    help='Explicitly disable debug mode (force production mode)')
-args = parser.parse_args()
-
 # =============================================================================
 # CONFIGURATION
 # =============================================================================
 
-# DEBUG MODE: Set to True to limit PR retrieval for testing
-# When enabled, only fetches up to 10 PRs per query pattern per agent
-# Priority: 1) Command-line args, 2) Environment variable, 3) Default (False)
-if args.no_debug:
-    DEBUG_MODE = False
-elif args.debug:
-    DEBUG_MODE = True
-else:
-    DEBUG_MODE = os.getenv('DEBUG_MODE', 'False').lower() in ('true', '1', 'yes')
-
-# In-memory cache for debug mode (data persists during session but NOT saved to HF)
-DEBUG_PR_METADATA_CACHE = defaultdict(list)
-
 AGENTS_REPO = "SWE-Arena/swe_agents"  # HuggingFace dataset for agent metadata
 PR_METADATA_REPO = "SWE-Arena/pr_metadata"  # HuggingFace dataset for PR metadata
-LEADERBOARD_TIME_FRAME_DAYS = 180  # Time frame for leaderboard
…
 
 LEADERBOARD_COLUMNS = [
     ("Agent Name", "string"),
@@ -66,7 +45,7 @@ def load_jsonl(filename):
     """Load JSONL file and return list of dictionaries."""
     if not os.path.exists(filename):
         return []
-
     data = []
     with open(filename, 'r', encoding='utf-8') as f:
         for line in f:
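For context, each line of a daily `[agent_identifier]/YYYY.MM.DD.jsonl` file is one PR record carrying the four fields the pipeline keeps (html_url, created_at, merged_at, closed_at); the URL below is a placeholder:

```json
{"html_url": "https://github.com/example-org/example-repo/pull/123", "created_at": "2025-10-15T23:23:47Z", "merged_at": null, "closed_at": null}
```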
@@ -87,784 +66,286 @@ def save_jsonl(filename, data):
|
|
| 87 |
f.write(json.dumps(item) + '\n')
|
| 88 |
|
| 89 |
|
| 90 |
-
def cache_to_dict(cache_list):
|
| 91 |
-
"""Convert list of cache entries to dictionary by identifier."""
|
| 92 |
-
return {entry['github_identifier']: entry for entry in cache_list}
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
def dict_to_cache(cache_dict):
|
| 96 |
-
"""Convert dictionary back to list of values."""
|
| 97 |
-
return list(cache_dict.values())
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
def normalize_date_format(date_string):
|
| 101 |
-
"""
|
| 102 |
-
Convert date strings to standardized ISO 8601 format with Z suffix.
|
| 103 |
-
Handles both old format (2025-10-15T23:23:47.983068) and new format (2025-10-15T23:23:47Z).
|
| 104 |
-
"""
|
| 105 |
-
if not date_string or date_string == 'N/A':
|
| 106 |
-
return 'N/A'
|
| 107 |
-
|
| 108 |
-
try:
|
| 109 |
-
# Parse the date string (handles both with and without microseconds)
|
| 110 |
-
if '.' in date_string:
|
| 111 |
-
# Old format with microseconds
|
| 112 |
-
dt = datetime.fromisoformat(date_string.replace('Z', '+00:00'))
|
| 113 |
-
else:
|
| 114 |
-
# Already in correct format or GitHub format
|
| 115 |
-
return date_string
|
| 116 |
-
|
| 117 |
-
# Convert to standardized format
|
| 118 |
-
return dt.strftime('%Y-%m-%dT%H:%M:%SZ')
|
| 119 |
-
except Exception as e:
|
| 120 |
-
print(f"Warning: Could not parse date '{date_string}': {e}")
|
| 121 |
-
return date_string
|
| 122 |
-
|
| 123 |
-
|
| 124 |
# =============================================================================
|
| 125 |
-
#
|
| 126 |
# =============================================================================
|
| 127 |
|
| 128 |
-
def
|
| 129 |
"""
|
| 130 |
-
|
| 131 |
-
Retries on 403/429 (rate limits), 5xx server errors, and transient network exceptions.
|
| 132 |
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
token: Optional token string used for this request (for rate limit tracking)
|
| 136 |
-
|
| 137 |
-
Returns the final requests.Response on success or non-retryable status, or None after exhausting retries.
|
| 138 |
"""
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
try:
|
| 142 |
-
resp = requests.request(
|
| 143 |
-
method,
|
| 144 |
-
url,
|
| 145 |
-
headers=headers or {},
|
| 146 |
-
params=params,
|
| 147 |
-
json=json_body,
|
| 148 |
-
data=data,
|
| 149 |
-
timeout=timeout
|
| 150 |
-
)
|
| 151 |
|
| 152 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
return resp
|
| 157 |
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
wait = None
|
| 161 |
-
reset_timestamp = None
|
| 162 |
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
if retry_after:
|
| 166 |
-
try:
|
| 167 |
-
wait = float(retry_after)
|
| 168 |
-
except Exception:
|
| 169 |
-
wait = None
|
| 170 |
-
|
| 171 |
-
# Fallback to X-RateLimit-Reset when 403/429
|
| 172 |
-
if wait is None and status in (403, 429):
|
| 173 |
-
reset_hdr = resp.headers.get('X-RateLimit-Reset') or resp.headers.get('x-ratelimit-reset')
|
| 174 |
-
if reset_hdr:
|
| 175 |
-
try:
|
| 176 |
-
reset_ts = int(float(reset_hdr))
|
| 177 |
-
reset_timestamp = reset_ts
|
| 178 |
-
wait = max(reset_ts - time.time() + 2, 1)
|
| 179 |
-
except Exception:
|
| 180 |
-
wait = None
|
| 181 |
|
| 182 |
-
|
| 183 |
-
if wait is None:
|
| 184 |
-
wait = delay + random.uniform(0, 0.5)
|
| 185 |
-
|
| 186 |
-
# Mark token as rate-limited if we have token pool and token info
|
| 187 |
-
if status in (403, 429) and token_pool and token:
|
| 188 |
-
token_pool.mark_rate_limited(token, reset_timestamp)
|
| 189 |
-
|
| 190 |
-
# Cap individual wait to avoid extreme sleeps
|
| 191 |
-
wait = max(1.0, min(wait, 120.0))
|
| 192 |
-
print(f"GitHub API {status}. Backing off {wait:.1f}s (attempt {attempt + 1}/{max_retries})...")
|
| 193 |
-
time.sleep(wait)
|
| 194 |
-
delay = min(delay * 2, 60.0)
|
| 195 |
-
continue
|
| 196 |
-
|
| 197 |
-
# Non-retryable error; return response for caller to handle
|
| 198 |
-
return resp
|
| 199 |
-
|
| 200 |
-
except requests.RequestException as e:
|
| 201 |
-
# Network error -> retry with backoff
|
| 202 |
-
wait = delay + random.uniform(0, 0.5)
|
| 203 |
-
wait = max(1.0, min(wait, 60.0))
|
| 204 |
-
print(f"Request error: {e}. Retrying in {wait:.1f}s (attempt {attempt + 1}/{max_retries})...")
|
| 205 |
-
time.sleep(wait)
|
| 206 |
-
delay = min(delay * 2, 60.0)
|
| 207 |
-
|
| 208 |
-
print(f"Exceeded max retries for {url}")
|
| 209 |
-
return None
|
| 210 |
-
|
| 211 |
-
def get_github_tokens():
|
| 212 |
-
"""Get all GitHub tokens from environment variables (all vars starting with GITHUB_TOKEN)."""
|
| 213 |
-
tokens = []
|
| 214 |
-
for key, value in os.environ.items():
|
| 215 |
-
if key.startswith('GITHUB_TOKEN') and value:
|
| 216 |
-
tokens.append(value)
|
| 217 |
-
|
| 218 |
-
if not tokens:
|
| 219 |
-
print("Warning: No GITHUB_TOKEN* found. API rate limits: 60/hour (authenticated: 5000/hour)")
|
| 220 |
else:
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
return tokens
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
def get_github_token():
|
| 227 |
-
"""Get primary GitHub token from environment variables (for backward compatibility)."""
|
| 228 |
-
token = os.getenv('GITHUB_TOKEN')
|
| 229 |
-
if not token:
|
| 230 |
-
print("Warning: GITHUB_TOKEN not found. API rate limits: 60/hour (authenticated: 5000/hour)")
|
| 231 |
-
return token
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
class TokenPool:
|
| 235 |
-
"""
|
| 236 |
-
Hybrid token pool that manages GitHub tokens with parallel execution and round-robin fallback.
|
| 237 |
-
|
| 238 |
-
Strategy:
|
| 239 |
-
- 50% of tokens allocated to parallel pool (for concurrent API calls)
|
| 240 |
-
- 50% of tokens allocated to round-robin pool (for rate limit fallback)
|
| 241 |
-
- Automatically switches to round-robin when parallel tokens hit rate limits
|
| 242 |
-
- Thread-safe for concurrent access
|
| 243 |
-
"""
|
| 244 |
-
def __init__(self, tokens):
|
| 245 |
-
import threading
|
| 246 |
-
|
| 247 |
-
self.all_tokens = tokens if tokens else [None]
|
| 248 |
-
self.lock = threading.Lock()
|
| 249 |
-
|
| 250 |
-
# Split tokens into parallel and round-robin pools (50/50)
|
| 251 |
-
total_tokens = len(self.all_tokens)
|
| 252 |
-
split_point = max(1, total_tokens // 2) # At least 1 token in each pool
|
| 253 |
-
|
| 254 |
-
self.parallel_tokens = self.all_tokens[:split_point]
|
| 255 |
-
self.roundrobin_tokens = self.all_tokens[split_point:]
|
| 256 |
-
|
| 257 |
-
# If only 1 token, use it in both pools
|
| 258 |
-
if total_tokens == 1:
|
| 259 |
-
self.parallel_tokens = self.all_tokens
|
| 260 |
-
self.roundrobin_tokens = self.all_tokens
|
| 261 |
-
|
| 262 |
-
# Track rate-limited tokens with reset times
|
| 263 |
-
self.rate_limited_parallel = {} # {token: reset_timestamp}
|
| 264 |
-
self.rate_limited_roundrobin = {} # {token: reset_timestamp}
|
| 265 |
-
|
| 266 |
-
# Round-robin index for fallback pool
|
| 267 |
-
self.roundrobin_index = 0
|
| 268 |
-
|
| 269 |
-
# Statistics
|
| 270 |
-
self.parallel_calls = 0
|
| 271 |
-
self.roundrobin_calls = 0
|
| 272 |
-
self.fallback_triggers = 0
|
| 273 |
-
|
| 274 |
-
print(f"🔄 Hybrid Token Pool initialized:")
|
| 275 |
-
print(f" Total tokens: {total_tokens}")
|
| 276 |
-
print(f" Parallel pool: {len(self.parallel_tokens)} token(s)")
|
| 277 |
-
print(f" Round-robin pool: {len(self.roundrobin_tokens)} token(s)")
|
| 278 |
-
|
| 279 |
-
def _clean_expired_rate_limits(self):
|
| 280 |
-
"""Remove tokens from rate limit tracking if their reset time has passed."""
|
| 281 |
-
current_time = time.time()
|
| 282 |
-
|
| 283 |
-
# Clean parallel pool
|
| 284 |
-
expired_parallel = [token for token, reset_time in self.rate_limited_parallel.items()
|
| 285 |
-
if current_time >= reset_time]
|
| 286 |
-
for token in expired_parallel:
|
| 287 |
-
del self.rate_limited_parallel[token]
|
| 288 |
-
|
| 289 |
-
# Clean round-robin pool
|
| 290 |
-
expired_roundrobin = [token for token, reset_time in self.rate_limited_roundrobin.items()
|
| 291 |
-
if current_time >= reset_time]
|
| 292 |
-
for token in expired_roundrobin:
|
| 293 |
-
del self.rate_limited_roundrobin[token]
|
| 294 |
-
|
| 295 |
-
def get_parallel_token(self):
|
| 296 |
-
"""
|
| 297 |
-
Get a token from the parallel pool for concurrent execution.
|
| 298 |
-
Returns None if all parallel tokens are rate-limited.
|
| 299 |
-
"""
|
| 300 |
-
with self.lock:
|
| 301 |
-
self._clean_expired_rate_limits()
|
| 302 |
-
|
| 303 |
-
# Find first non-rate-limited token in parallel pool
|
| 304 |
-
for token in self.parallel_tokens:
|
| 305 |
-
if token not in self.rate_limited_parallel:
|
| 306 |
-
self.parallel_calls += 1
|
| 307 |
-
return token
|
| 308 |
-
|
| 309 |
-
return None # All parallel tokens are rate-limited
|
| 310 |
-
|
| 311 |
-
def get_available_parallel_tokens(self):
|
| 312 |
-
"""
|
| 313 |
-
Get all available tokens from parallel pool (not rate-limited).
|
| 314 |
-
Used for batch parallel execution.
|
| 315 |
-
"""
|
| 316 |
-
with self.lock:
|
| 317 |
-
self._clean_expired_rate_limits()
|
| 318 |
-
available = [token for token in self.parallel_tokens
|
| 319 |
-
if token not in self.rate_limited_parallel]
|
| 320 |
-
return available
|
| 321 |
-
|
| 322 |
-
def get_roundrobin_token(self):
|
| 323 |
-
"""
|
| 324 |
-
Get the next token from round-robin pool (fallback mechanism).
|
| 325 |
-
Skips rate-limited tokens and rotates to the next available one.
|
| 326 |
-
"""
|
| 327 |
-
with self.lock:
|
| 328 |
-
self._clean_expired_rate_limits()
|
| 329 |
-
|
| 330 |
-
attempts = 0
|
| 331 |
-
max_attempts = len(self.roundrobin_tokens)
|
| 332 |
-
|
| 333 |
-
while attempts < max_attempts:
|
| 334 |
-
token = self.roundrobin_tokens[self.roundrobin_index]
|
| 335 |
-
self.roundrobin_index = (self.roundrobin_index + 1) % len(self.roundrobin_tokens)
|
| 336 |
-
|
| 337 |
-
if token not in self.rate_limited_roundrobin:
|
| 338 |
-
self.roundrobin_calls += 1
|
| 339 |
-
return token
|
| 340 |
-
|
| 341 |
-
attempts += 1
|
| 342 |
-
|
| 343 |
-
# All round-robin tokens are rate-limited
|
| 344 |
-
return None
|
| 345 |
-
|
| 346 |
-
def get_next_token(self):
|
| 347 |
-
"""
|
| 348 |
-
Get the next available token (try parallel first, fallback to round-robin).
|
| 349 |
-
This is the main method for backwards compatibility.
|
| 350 |
-
"""
|
| 351 |
-
# Try parallel pool first
|
| 352 |
-
token = self.get_parallel_token()
|
| 353 |
-
if token:
|
| 354 |
-
return token
|
| 355 |
-
|
| 356 |
-
# Fallback to round-robin
|
| 357 |
-
with self.lock:
|
| 358 |
-
self.fallback_triggers += 1
|
| 359 |
-
|
| 360 |
-
token = self.get_roundrobin_token()
|
| 361 |
-
if token:
|
| 362 |
-
return token
|
| 363 |
-
|
| 364 |
-
# All tokens exhausted - return first parallel token anyway (will hit rate limit)
|
| 365 |
-
return self.parallel_tokens[0] if self.parallel_tokens else None
|
| 366 |
-
|
| 367 |
-
def get_headers(self):
|
| 368 |
-
"""Get headers with the next available token."""
|
| 369 |
-
token = self.get_next_token()
|
| 370 |
-
return {'Authorization': f'token {token}'} if token else {}
|
| 371 |
-
|
| 372 |
-
def mark_rate_limited(self, token, reset_timestamp=None):
|
| 373 |
-
"""
|
| 374 |
-
Mark a token as rate-limited with optional reset timestamp.
|
| 375 |
-
|
| 376 |
-
Args:
|
| 377 |
-
token: The token that hit rate limit
|
| 378 |
-
reset_timestamp: Unix timestamp when rate limit resets (optional)
|
| 379 |
-
"""
|
| 380 |
-
with self.lock:
|
| 381 |
-
# Default to 1 hour from now if no reset time provided
|
| 382 |
-
if reset_timestamp is None:
|
| 383 |
-
reset_timestamp = time.time() + 3600
|
| 384 |
-
|
| 385 |
-
# Mark in appropriate pool
|
| 386 |
-
if token in self.parallel_tokens:
|
| 387 |
-
self.rate_limited_parallel[token] = reset_timestamp
|
| 388 |
-
print(f" ⚠️ Parallel token marked as rate-limited until {datetime.fromtimestamp(reset_timestamp, timezone.utc).strftime('%H:%M:%S UTC')}")
|
| 389 |
-
|
| 390 |
-
if token in self.roundrobin_tokens:
|
| 391 |
-
self.rate_limited_roundrobin[token] = reset_timestamp
|
| 392 |
-
print(f" ⚠️ Round-robin token marked as rate-limited until {datetime.fromtimestamp(reset_timestamp, timezone.utc).strftime('%H:%M:%S UTC')}")
|
| 393 |
-
|
| 394 |
-
def get_stats(self):
|
| 395 |
-
"""Get usage statistics for monitoring."""
|
| 396 |
-
with self.lock:
|
| 397 |
-
return {
|
| 398 |
-
'parallel_calls': self.parallel_calls,
|
| 399 |
-
'roundrobin_calls': self.roundrobin_calls,
|
| 400 |
-
'fallback_triggers': self.fallback_triggers,
|
| 401 |
-
'parallel_rate_limited': len(self.rate_limited_parallel),
|
| 402 |
-
'roundrobin_rate_limited': len(self.rate_limited_roundrobin)
|
| 403 |
-
}
|
| 404 |
-
|
| 405 |
-
def print_stats(self):
|
| 406 |
-
"""Print usage statistics."""
|
| 407 |
-
stats = self.get_stats()
|
| 408 |
-
total_calls = stats['parallel_calls'] + stats['roundrobin_calls']
|
| 409 |
-
|
| 410 |
-
if total_calls > 0:
|
| 411 |
-
print(f"\n📊 Token Pool Statistics:")
|
| 412 |
-
print(f" Total API calls: {total_calls}")
|
| 413 |
-
print(f" Parallel calls: {stats['parallel_calls']} ({stats['parallel_calls']/total_calls*100:.1f}%)")
|
| 414 |
-
print(f" Round-robin calls: {stats['roundrobin_calls']} ({stats['roundrobin_calls']/total_calls*100:.1f}%)")
|
| 415 |
-
print(f" Fallback triggers: {stats['fallback_triggers']}")
|
| 416 |
-
print(f" Currently rate-limited: {stats['parallel_rate_limited']} parallel, {stats['roundrobin_rate_limited']} round-robin")
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
def validate_github_username(identifier):
|
| 420 |
-
"""Verify that a GitHub identifier exists with backoff-aware requests."""
|
| 421 |
-
try:
|
| 422 |
-
token = get_github_token()
|
| 423 |
-
headers = {'Authorization': f'token {token}'} if token else {}
|
| 424 |
-
url = f'https://api.github.com/users/{identifier}'
|
| 425 |
-
response = request_with_backoff('GET', url, headers=headers, max_retries=1,
|
| 426 |
-
token_pool=None, token=token)
|
| 427 |
-
if response is None:
|
| 428 |
-
return False, "Validation error: network/rate limit exhausted"
|
| 429 |
-
if response.status_code == 200:
|
| 430 |
-
return True, "Username is valid"
|
| 431 |
-
elif response.status_code == 404:
|
| 432 |
-
return False, "GitHub identifier not found"
|
| 433 |
-
else:
|
| 434 |
-
return False, f"Validation error: HTTP {response.status_code}"
|
| 435 |
-
except Exception as e:
|
| 436 |
-
return False, f"Validation error: {str(e)}"
|
| 437 |
|
| 438 |
|
| 439 |
-
def
|
| 440 |
"""
|
| 441 |
-
|
| 442 |
-
Recursively splits the time range if hitting the 1000-result limit.
|
| 443 |
-
Supports splitting by day, hour, minute, and second as needed.
|
| 444 |
|
| 445 |
Args:
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
depth: Current recursion depth (for tracking)
|
| 449 |
|
| 450 |
-
Returns
|
|
|
|
| 451 |
"""
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
total_seconds = time_diff.total_seconds()
|
| 455 |
-
|
| 456 |
-
# Determine granularity and format dates accordingly
|
| 457 |
-
if total_seconds >= 86400: # >= 1 day
|
| 458 |
-
# Use day granularity (YYYY-MM-DD)
|
| 459 |
-
start_str = start_date.strftime('%Y-%m-%d')
|
| 460 |
-
end_str = end_date.strftime('%Y-%m-%d')
|
| 461 |
-
elif total_seconds >= 3600: # >= 1 hour but < 1 day
|
| 462 |
-
# Use hour granularity (YYYY-MM-DDTHH:MM:SSZ)
|
| 463 |
-
start_str = start_date.strftime('%Y-%m-%dT%H:00:00Z')
|
| 464 |
-
end_str = end_date.strftime('%Y-%m-%dT%H:59:59Z')
|
| 465 |
-
elif total_seconds >= 60: # >= 1 minute but < 1 hour
|
| 466 |
-
# Use minute granularity (YYYY-MM-DDTHH:MM:SSZ)
|
| 467 |
-
start_str = start_date.strftime('%Y-%m-%dT%H:%M:00Z')
|
| 468 |
-
end_str = end_date.strftime('%Y-%m-%dT%H:%M:59Z')
|
| 469 |
-
else: # < 1 minute
|
| 470 |
-
# Use second granularity (YYYY-MM-DDTHH:MM:SSZ)
|
| 471 |
-
start_str = start_date.strftime('%Y-%m-%dT%H:%M:%SZ')
|
| 472 |
-
end_str = end_date.strftime('%Y-%m-%dT%H:%M:%SZ')
|
| 473 |
-
|
| 474 |
-
# Add date range to query
|
| 475 |
-
query = f'{base_query} created:{start_str}..{end_str}'
|
| 476 |
-
|
| 477 |
-
indent = " " + " " * depth
|
| 478 |
-
print(f"{indent}Searching range {start_str} to {end_str}...")
|
| 479 |
-
|
| 480 |
-
page = 1
|
| 481 |
-
per_page = 100
|
| 482 |
-
total_in_partition = 0
|
| 483 |
-
|
| 484 |
-
while True:
|
| 485 |
-
# Check debug limit
|
| 486 |
-
if debug_limit is not None and total_in_partition >= debug_limit:
|
| 487 |
-
print(f"{indent} 🐛 DEBUG MODE: Reached limit of {debug_limit} PRs, stopping...")
|
| 488 |
-
return total_in_partition
|
| 489 |
-
url = 'https://api.github.com/search/issues'
|
| 490 |
-
params = {
|
| 491 |
-
'q': query,
|
| 492 |
-
'per_page': per_page,
|
| 493 |
-
'page': page,
|
| 494 |
-
'sort': 'created',
|
| 495 |
-
'order': 'asc'
|
| 496 |
-
}
|
| 497 |
-
|
| 498 |
-
try:
|
| 499 |
-
# Get token for tracking
|
| 500 |
-
token = token_pool.get_next_token()
|
| 501 |
-
headers = {'Authorization': f'token {token}'} if token else {}
|
| 502 |
-
|
| 503 |
-
response = request_with_backoff('GET', url, headers=headers, params=params,
|
| 504 |
-
token_pool=token_pool, token=token)
|
| 505 |
-
if response is None:
|
| 506 |
-
print(f"{indent} Error: retries exhausted for range {start_str} to {end_str}")
|
| 507 |
-
return total_in_partition
|
| 508 |
-
|
| 509 |
-
if response.status_code != 200:
|
| 510 |
-
print(f"{indent} Error: HTTP {response.status_code} for range {start_str} to {end_str}")
|
| 511 |
-
return total_in_partition
|
| 512 |
-
|
| 513 |
-
data = response.json()
|
| 514 |
-
total_count = data.get('total_count', 0)
|
| 515 |
-
items = data.get('items', [])
|
| 516 |
-
|
| 517 |
-
if not items:
|
| 518 |
-
break
|
| 519 |
-
|
| 520 |
-
# Add PRs to global dict
|
| 521 |
-
for pr in items:
|
| 522 |
-
pr_id = pr.get('id')
|
| 523 |
-
if pr_id and pr_id not in prs_by_id:
|
| 524 |
-
prs_by_id[pr_id] = pr
|
| 525 |
-
total_in_partition += 1
|
| 526 |
-
|
| 527 |
-
# Check if we hit the 1000-result limit
|
| 528 |
-
if total_count > 1000 and page == 10:
|
| 529 |
-
print(f"{indent} ⚠️ Hit 1000-result limit ({total_count} total). Splitting time range...")
|
| 530 |
-
|
| 531 |
-
# Determine how to split based on time range duration
|
| 532 |
-
if total_seconds < 2: # Less than 2 seconds - can't split further
|
| 533 |
-
print(f"{indent} ⚠️ Cannot split further (range < 2 seconds). Some results may be missing.")
|
| 534 |
-
break
|
| 535 |
-
|
| 536 |
-
elif total_seconds < 120: # Less than 2 minutes - split by seconds
|
| 537 |
-
# Split into 2-4 parts depending on range
|
| 538 |
-
num_splits = min(4, max(2, int(total_seconds / 30)))
|
| 539 |
-
split_duration = time_diff / num_splits
|
| 540 |
-
split_dates = [start_date + split_duration * i for i in range(num_splits + 1)]
|
| 541 |
-
|
| 542 |
-
total_from_splits = 0
|
| 543 |
-
for i in range(num_splits):
|
| 544 |
-
split_start = split_dates[i]
|
| 545 |
-
split_end = split_dates[i + 1]
|
| 546 |
-
# Avoid overlapping ranges (add 1 second to start)
|
| 547 |
-
if i > 0:
|
| 548 |
-
split_start = split_start + timedelta(seconds=1)
|
| 549 |
-
|
| 550 |
-
count = fetch_prs_with_time_partition(
|
| 551 |
-
base_query, split_start, split_end, token_pool, prs_by_id, debug_limit, depth + 1
|
| 552 |
-
)
|
| 553 |
-
total_from_splits += count
|
| 554 |
-
|
| 555 |
-
return total_from_splits
|
| 556 |
-
|
| 557 |
-
elif total_seconds < 7200: # Less than 2 hours - split by minutes
|
| 558 |
-
# Split into 2-4 parts
|
| 559 |
-
num_splits = min(4, max(2, int(total_seconds / 1800)))
|
| 560 |
-
split_duration = time_diff / num_splits
|
| 561 |
-
split_dates = [start_date + split_duration * i for i in range(num_splits + 1)]
|
| 562 |
-
|
| 563 |
-
total_from_splits = 0
|
| 564 |
-
for i in range(num_splits):
|
| 565 |
-
split_start = split_dates[i]
|
| 566 |
-
split_end = split_dates[i + 1]
|
| 567 |
-
# Avoid overlapping ranges (add 1 minute to start)
|
| 568 |
-
if i > 0:
|
| 569 |
-
split_start = split_start + timedelta(minutes=1)
|
| 570 |
-
|
| 571 |
-
count = fetch_prs_with_time_partition(
|
| 572 |
-
base_query, split_start, split_end, token_pool, prs_by_id, debug_limit, depth + 1
|
| 573 |
-
)
|
| 574 |
-
total_from_splits += count
|
| 575 |
-
|
| 576 |
-
return total_from_splits
|
| 577 |
-
|
| 578 |
-
elif total_seconds < 172800: # Less than 2 days - split by hours
|
| 579 |
-
# Split into 2-4 parts
|
| 580 |
-
num_splits = min(4, max(2, int(total_seconds / 43200)))
|
| 581 |
-
split_duration = time_diff / num_splits
|
| 582 |
-
split_dates = [start_date + split_duration * i for i in range(num_splits + 1)]
|
| 583 |
-
|
| 584 |
-
total_from_splits = 0
|
| 585 |
-
for i in range(num_splits):
|
| 586 |
-
split_start = split_dates[i]
|
| 587 |
-
split_end = split_dates[i + 1]
|
| 588 |
-
# Avoid overlapping ranges (add 1 hour to start)
|
| 589 |
-
if i > 0:
|
| 590 |
-
split_start = split_start + timedelta(hours=1)
|
| 591 |
-
|
| 592 |
-
count = fetch_prs_with_time_partition(
|
| 593 |
-
base_query, split_start, split_end, token_pool, prs_by_id, debug_limit, depth + 1
|
| 594 |
-
)
|
| 595 |
-
total_from_splits += count
|
| 596 |
-
|
| 597 |
-
return total_from_splits
|
| 598 |
-
|
| 599 |
-
else: # 2+ days - split by days
|
| 600 |
-
days_diff = time_diff.days
|
| 601 |
-
|
| 602 |
-
# Use aggressive splitting for large ranges or deep recursion
|
| 603 |
-
# Split into 4 parts if range is > 30 days, otherwise split in half
|
| 604 |
-
if days_diff > 30 or depth > 5:
|
| 605 |
-
# Split into 4 parts for more aggressive partitioning
|
| 606 |
-
quarter_diff = time_diff / 4
|
| 607 |
-
split_dates = [
|
| 608 |
-
start_date,
|
| 609 |
-
start_date + quarter_diff,
|
| 610 |
-
start_date + quarter_diff * 2,
|
| 611 |
-
start_date + quarter_diff * 3,
|
| 612 |
-
end_date
|
| 613 |
-
]
|
| 614 |
-
|
| 615 |
-
total_from_splits = 0
|
| 616 |
-
for i in range(4):
|
| 617 |
-
split_start = split_dates[i]
|
| 618 |
-
split_end = split_dates[i + 1]
|
| 619 |
-
# Avoid overlapping ranges
|
| 620 |
-
if i > 0:
|
| 621 |
-
split_start = split_start + timedelta(days=1)
|
| 622 |
-
|
| 623 |
-
count = fetch_prs_with_time_partition(
|
| 624 |
-
base_query, split_start, split_end, token_pool, prs_by_id, debug_limit, depth + 1
|
| 625 |
-
)
|
| 626 |
-
total_from_splits += count
|
| 627 |
-
|
| 628 |
-
return total_from_splits
|
| 629 |
-
else:
|
| 630 |
-
# Binary split for smaller ranges
|
| 631 |
-
mid_date = start_date + time_diff / 2
|
| 632 |
-
|
| 633 |
-
# Recursively fetch both halves
|
| 634 |
-
count1 = fetch_prs_with_time_partition(
|
| 635 |
-
base_query, start_date, mid_date, token_pool, prs_by_id, debug_limit, depth + 1
|
| 636 |
-
)
|
| 637 |
-
count2 = fetch_prs_with_time_partition(
|
| 638 |
-
base_query, mid_date + timedelta(days=1), end_date, token_pool, prs_by_id, debug_limit, depth + 1
|
| 639 |
-
)
|
| 640 |
-
|
| 641 |
-
return count1 + count2
|
| 642 |
-
|
| 643 |
-
# Normal pagination: check if there are more pages
|
| 644 |
-
if len(items) < per_page or page >= 10:
|
| 645 |
-
break
|
| 646 |
-
|
| 647 |
-
page += 1
|
| 648 |
-
time.sleep(0.5) # Courtesy delay between pages
|
| 649 |
-
|
| 650 |
-
except Exception as e:
|
| 651 |
-
print(f"{indent} Error fetching range {start_str} to {end_str}: {str(e)}")
|
| 652 |
-
return total_in_partition
|
| 653 |
|
| 654 |
-
|
| 655 |
-
|
|
|
|
|
|
|
| 656 |
|
| 657 |
-
|
|
|
|
|
|
|
| 658 |
|
| 659 |
|
| 660 |
-
def
|
| 661 |
"""
|
| 662 |
-
|
| 663 |
-
Only keeps essential fields: html_url, created_at, merged_at, closed_at.
|
| 664 |
-
Note: agent_name is not stored as it's inferred from the folder structure.
|
| 665 |
-
"""
|
| 666 |
-
pull_request = pr.get('pull_request', {})
|
| 667 |
-
|
| 668 |
-
# Extract dates
|
| 669 |
-
created_at = pr.get('created_at')
|
| 670 |
-
merged_at = pull_request.get('merged_at')
|
| 671 |
-
closed_at = pr.get('closed_at')
|
| 672 |
-
|
| 673 |
-
# Only store closed_at if PR is closed but not merged
|
| 674 |
-
if merged_at:
|
| 675 |
-
closed_at = None # Don't store redundant info
|
| 676 |
-
|
| 677 |
-
return {
|
| 678 |
-
'html_url': pr.get('html_url'),
|
| 679 |
-
'created_at': created_at,
|
| 680 |
-
'merged_at': merged_at,
|
| 681 |
-
'closed_at': closed_at
|
| 682 |
-
}
|
| 683 |
-
|
| 684 |
|
| 685 |
-
|
| 686 |
-
|
| 687 |
-
|
|
|
|
| 688 |
|
| 689 |
Args:
|
| 690 |
-
|
| 691 |
-
|
| 692 |
-
|
| 693 |
-
|
| 694 |
-
max_workers: Maximum number of concurrent workers (defaults to number of available parallel tokens)
|
| 695 |
|
| 696 |
Returns:
|
| 697 |
-
Dictionary mapping
|
| 698 |
"""
|
| 699 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 700 |
|
| 701 |
-
|
| 702 |
-
|
| 703 |
-
|
| 704 |
-
|
| 705 |
-
|
| 706 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 707 |
|
| 708 |
-
|
| 709 |
-
|
| 710 |
|
| 711 |
-
|
|
|
|
|
|
|
| 712 |
|
| 713 |
-
|
| 714 |
|
| 715 |
-
|
| 716 |
-
|
| 717 |
-
prs_by_id = {}
|
| 718 |
-
try:
|
| 719 |
-
prs_found = fetch_prs_with_time_partition(
|
| 720 |
-
pattern,
|
| 721 |
-
start_date,
|
| 722 |
-
end_date,
|
| 723 |
-
token_pool,
|
| 724 |
-
prs_by_id,
|
| 725 |
-
debug_limit=None
|
| 726 |
-
)
|
| 727 |
-
return pattern, prs_by_id
|
| 728 |
-
except Exception as e:
|
| 729 |
-
print(f" ✗ Error in parallel fetch for pattern '{pattern}': {str(e)}")
|
| 730 |
-
return pattern, {}
|
| 731 |
-
|
| 732 |
-
# Execute patterns in parallel
|
| 733 |
-
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
|
| 734 |
-
# Submit all tasks
|
| 735 |
-
future_to_pattern = {
|
| 736 |
-
executor.submit(fetch_single_pattern, pattern): pattern
|
| 737 |
-
for pattern in query_patterns
|
| 738 |
-
}
|
| 739 |
|
| 740 |
-
|
| 741 |
-
|
| 742 |
-
|
| 743 |
-
|
| 744 |
-
|
| 745 |
-
results[pattern_key] = prs
|
| 746 |
-
print(f" ✓ Parallel fetch completed for pattern: {pattern_key}")
|
| 747 |
-
except Exception as e:
|
| 748 |
-
print(f" ✗ Parallel fetch failed for pattern '{pattern}': {str(e)}")
|
| 749 |
-
results[pattern] = {}
|
| 750 |
|
| 751 |
-
|
|
|
|
|
|
|
| 752 |
|
|
|
|
|
|
|
|
|
|
| 753 |
|
| 754 |
-
|
| 755 |
-
|
| 756 |
-
|
|
|
|
|
|
|
|
|
|
| 757 |
|
| 758 |
-
|
| 759 |
-
|
| 760 |
-
|
| 761 |
-
|
| 762 |
-
target_date: Date object for which to fetch PRs (defaults to yesterday)
|
| 763 |
|
| 764 |
-
|
| 765 |
-
|
| 766 |
-
|
| 767 |
-
|
| 768 |
-
|
| 769 |
-
|
| 770 |
-
|
| 771 |
-
|
| 772 |
-
|
| 773 |
-
if DEBUG_MODE:
|
| 774 |
-
print(f"\n🐛 DEBUG MODE ENABLED: Limiting to {debug_limit_per_pattern} PRs per query pattern")
|
| 775 |
-
|
| 776 |
-
# Define query patterns per rules:
|
| 777 |
-
# 1) author pattern only if identifier contains "[bot]"
|
| 778 |
-
# 2) co-author and head patterns use identifier with "[bot]" removed
|
| 779 |
-
stripped_id = identifier.replace('[bot]', '')
|
| 780 |
-
query_patterns = []
|
| 781 |
-
if '[bot]' in identifier:
|
| 782 |
-
query_patterns.append(f'is:pr author:{identifier}')
|
| 783 |
-
if stripped_id:
|
| 784 |
-
query_patterns.append(f'is:pr "co-authored-by: {stripped_id}"')
|
| 785 |
-
query_patterns.append(f'is:pr head:{stripped_id}/')
|
| 786 |
-
|
| 787 |
-
# Use a dict to deduplicate PRs by ID
|
| 788 |
-
prs_by_id = {}
|
| 789 |
-
|
| 790 |
-
# Convert target_date to datetime for API queries
|
| 791 |
-
start_date = datetime.combine(target_date, datetime.min.time()).replace(tzinfo=timezone.utc)
|
| 792 |
-
end_date = datetime.combine(target_date, datetime.max.time()).replace(tzinfo=timezone.utc)
|
| 793 |
-
|
| 794 |
-
# Try parallel execution first if enabled
|
| 795 |
-
if use_parallel and not DEBUG_MODE and len(query_patterns) > 1:
|
| 796 |
-
print(f"\n🚀 Attempting parallel execution for {len(query_patterns)} query patterns...")
|
| 797 |
-
parallel_start_time = time.time()
|
| 798 |
-
|
| 799 |
-
parallel_results = fetch_prs_parallel(query_patterns, start_date, end_date, token_pool)
|
| 800 |
-
|
| 801 |
-
if parallel_results is not None:
|
| 802 |
-
# Merge results from parallel execution
|
| 803 |
-
for pattern, pattern_prs in parallel_results.items():
|
| 804 |
-
for pr_id, pr in pattern_prs.items():
|
| 805 |
-
if pr_id not in prs_by_id:
|
| 806 |
-
prs_by_id[pr_id] = pr
|
| 807 |
-
|
| 808 |
-
parallel_duration = time.time() - parallel_start_time
|
| 809 |
-
print(f"\n ✅ Parallel execution complete: {len(prs_by_id)} unique PRs found")
|
| 810 |
-
print(f" ⏱️ Total time: {parallel_duration:.1f} seconds")
|
| 811 |
-
|
| 812 |
-
# Print token pool statistics
|
| 813 |
-
token_pool.print_stats()
|
| 814 |
-
else:
|
| 815 |
-
# Fallback to sequential execution
|
| 816 |
-
print(" ⚠️ Parallel execution not available, falling back to sequential...")
|
| 817 |
-
use_parallel = False
|
| 818 |
-
|
| 819 |
-
# Sequential execution (fallback or if parallel disabled)
|
| 820 |
-
if not use_parallel or DEBUG_MODE or len(query_patterns) <= 1:
|
| 821 |
-
for query_pattern in query_patterns:
|
| 822 |
-
print(f"\n🔍 Searching with query: {query_pattern}")
|
| 823 |
-
print(f" Date: {target_date.strftime('%Y-%m-%d')}")
|
| 824 |
-
|
| 825 |
-
pattern_start_time = time.time()
|
| 826 |
-
initial_count = len(prs_by_id)
|
| 827 |
-
|
| 828 |
-
# Fetch with time partitioning (for single day)
|
| 829 |
-
prs_found = fetch_prs_with_time_partition(
|
| 830 |
-
query_pattern,
|
| 831 |
-
start_date,
|
| 832 |
-
end_date,
|
| 833 |
-
token_pool,
|
| 834 |
-
prs_by_id,
|
| 835 |
-
debug_limit_per_pattern
|
| 836 |
-
)
|
| 837 |
|
| 838 |
-
|
| 839 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 840 |
|
| 841 |
-
|
| 842 |
-
|
|
|
|
|
|
|
|
|
|
| 843 |
|
| 844 |
-
# Delay between different query patterns (shorter in debug mode)
|
| 845 |
-
time.sleep(0.2 if DEBUG_MODE else 1.0)
|
| 846 |
|
| 847 |
-
|
| 848 |
-
|
|
|
|
| 849 |
|
| 850 |
-
|
| 851 |
-
|
| 852 |
-
|
| 853 |
-
|
| 854 |
-
print(
|
| 855 |
-
|
| 856 |
|
| 857 |
-
metadata_list = [extract_pr_metadata(pr) for pr in all_prs]
|
| 858 |
|
| 859 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 860 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 861 |
|
| 862 |
|
|
|
|
|
|
|
|
|
|
| 863 |
|
| 864 |
def calculate_pr_stats_from_metadata(metadata_list):
|
| 865 |
"""
|
| 866 |
Calculate statistics from a list of PR metadata (lightweight objects).
|
| 867 |
-
Works with minimal metadata: html_url, created_at, merged_at, closed_at
|
| 868 |
|
| 869 |
Returns a dictionary with comprehensive PR metrics.
|
| 870 |
|
|
@@ -893,11 +374,15 @@ def calculate_pr_stats_from_metadata(metadata_list):
     }
 
 
-def calculate_monthly_metrics_by_agent():
     """
-    Calculate monthly metrics for all agents for visualization.
     Loads data directly from SWE-Arena/pr_metadata dataset.
 
…
     Returns:
         dict: {
             'agents': list of agent names,
@@ -962,8 +447,7 @@ def calculate_monthly_metrics_by_agent():
         for month in months:
             prs_in_month = month_dict.get(month, [])
 
-            # Count merged PRs
-            # Note: We're filtering by created_at, but counting based on merged_at/closed_at
             merged_count = sum(1 for pr in prs_in_month if pr.get('merged_at'))
 
             # Count closed but not merged
@@ -989,8 +473,25 @@ def calculate_monthly_metrics_by_agent():
                 'closed_not_merged': closed_not_merged_list
             }
 
…
     return {
-        'agents': …
         'months': months,
         'data': result_data
     }
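The added top_n logic itself is not captured in this view. A plausible shape of the filtering, with the ranking criterion and the `total_prs` key as assumptions, is:

```python
# Hypothetical helper for the new top_n filtering (key names and ranking rule are
# assumptions; the diff does not show the added lines).
def keep_top_agents(result_data, top_n=5):
    """Keep only the top_n agents, ranked by total PRs created across the window."""
    totals = {
        agent: sum(series.get('total_prs', []))
        for agent, series in result_data.items()
    }
    top = sorted(totals, key=totals.get, reverse=True)[:top_n]
    return {agent: result_data[agent] for agent in top}
```

calculate_monthly_metrics_by_agent(top_n=5) would then pass result_data through a filter like this before returning the 'agents', 'months', and 'data' keys.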
@@ -1026,106 +527,76 @@ def save_pr_metadata_to_hf(metadata_list, agent_identifier):
|
|
| 1026 |
"""
|
| 1027 |
Save PR metadata to HuggingFace dataset, organized by [agent_identifier]/YYYY.MM.DD.jsonl.
|
| 1028 |
Each file is stored in the agent's folder and named YYYY.MM.DD.jsonl for that day's PRs.
|
| 1029 |
-
In debug mode, saves to in-memory cache only.
|
| 1030 |
|
| 1031 |
-
This function
|
| 1032 |
-
Uses batch upload to avoid
|
| 1033 |
|
| 1034 |
Args:
|
| 1035 |
metadata_list: List of PR metadata dictionaries
|
| 1036 |
agent_identifier: GitHub identifier of the agent (used as folder name)
|
| 1037 |
"""
|
| 1038 |
-
import tempfile
|
| 1039 |
import shutil
|
| 1040 |
|
| 1041 |
-
# Skip saving to HF in debug mode - use in-memory cache instead
|
| 1042 |
-
if DEBUG_MODE:
|
| 1043 |
-
global DEBUG_PR_METADATA_CACHE
|
| 1044 |
-
# Merge with existing cache, deduplicating by html_url
|
| 1045 |
-
existing = {pr['html_url']: pr for pr in DEBUG_PR_METADATA_CACHE[agent_identifier] if pr.get('html_url')}
|
| 1046 |
-
new = {pr['html_url']: pr for pr in metadata_list if pr.get('html_url')}
|
| 1047 |
-
existing.update(new)
|
| 1048 |
-
DEBUG_PR_METADATA_CACHE[agent_identifier] = list(existing.values())
|
| 1049 |
-
print(f"🐛 DEBUG MODE: Saved to in-memory cache only ({len(metadata_list)} PRs) - NOT saved to HuggingFace")
|
| 1050 |
-
return True
|
| 1051 |
-
|
| 1052 |
try:
|
| 1053 |
token = get_hf_token()
|
| 1054 |
if not token:
|
| 1055 |
raise Exception("No HuggingFace token found")
|
| 1056 |
|
| 1057 |
-
api = HfApi()
|
| 1058 |
|
| 1059 |
-
# Group by
|
| 1060 |
grouped = group_metadata_by_date(metadata_list)
|
| 1061 |
|
| 1062 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1063 |
temp_dir = tempfile.mkdtemp()
|
| 1064 |
-
|
| 1065 |
-
os.makedirs(
|
| 1066 |
|
| 1067 |
try:
|
| 1068 |
-
print(f"📦 Preparing {len(grouped)} daily files
|
| 1069 |
|
|
|
|
| 1070 |
for (pr_year, month, day), day_metadata in grouped.items():
|
| 1071 |
-
# New structure: [agent_identifier]/YYYY.MM.DD.jsonl
|
| 1072 |
filename = f"{agent_identifier}/{pr_year}.{month:02d}.{day:02d}.jsonl"
|
| 1073 |
-
|
| 1074 |
|
| 1075 |
-
|
|
|
|
| 1076 |
|
| 1077 |
-
#
|
| 1078 |
-
|
| 1079 |
-
|
| 1080 |
-
|
| 1081 |
-
|
| 1082 |
-
|
| 1083 |
-
|
| 1084 |
-
token=token
|
| 1085 |
-
)
|
| 1086 |
-
existing_metadata = load_jsonl(file_path)
|
| 1087 |
-
print(f" Found {len(existing_metadata)} existing PRs, merging...")
|
| 1088 |
-
except Exception:
|
| 1089 |
-
print(f" No existing file found, creating new...")
|
| 1090 |
-
|
| 1091 |
-
# Merge and deduplicate by html_url
|
| 1092 |
-
existing_by_url = {meta['html_url']: meta for meta in existing_metadata if meta.get('html_url')}
|
| 1093 |
-
new_by_url = {meta['html_url']: meta for meta in day_metadata if meta.get('html_url')}
|
| 1094 |
-
|
| 1095 |
-
# Update with new data (new data overwrites old)
|
| 1096 |
-
existing_by_url.update(new_by_url)
|
| 1097 |
-
merged_metadata = list(existing_by_url.values())
|
| 1098 |
-
|
| 1099 |
-
# Save to temp directory
|
| 1100 |
-
save_jsonl(local_path, merged_metadata)
|
| 1101 |
-
print(f" ✓ Prepared {len(merged_metadata)} total PRs")
|
| 1102 |
-
|
| 1103 |
-
# Batch upload entire folder in a single commit
|
| 1104 |
-
print(f"\n📤 Uploading all files for {agent_identifier} in one batch...")
|
| 1105 |
-
api.upload_folder(
|
| 1106 |
folder_path=temp_dir,
|
| 1107 |
repo_id=PR_METADATA_REPO,
|
| 1108 |
-
repo_type="dataset"
|
| 1109 |
-
token=token,
|
| 1110 |
-
commit_message=f"Update PR metadata for {agent_identifier}"
|
| 1111 |
)
|
| 1112 |
-
print(f" ✓
|
| 1113 |
|
| 1114 |
-
|
| 1115 |
-
# Clean up temporary directory
|
| 1116 |
-
shutil.rmtree(temp_dir, ignore_errors=True)
|
| 1117 |
|
| 1118 |
-
|
|
|
|
|
|
|
|
|
|
| 1119 |
|
| 1120 |
except Exception as e:
|
| 1121 |
-
print(f"✗ Error saving PR metadata: {str(e)}")
|
|
|
|
|
|
|
| 1122 |
return False
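The replacement upload path is only partially visible above. A minimal sketch of the upload_large_folder-based flow described in the commit message, with the helper name and overwrite handling as assumptions, looks like:

```python
# Sketch only: batch all daily files for one agent into a temp folder, then push
# the whole tree with upload_large_folder (resumable, multi-commit upload).
# Note: unlike upload_folder, upload_large_folder takes no commit_message, and a
# true "complete overwrite" would also need stale remote files removed (not shown).
import os
import shutil
import tempfile

from huggingface_hub import HfApi


def overwrite_agent_folder(grouped, agent_identifier, repo_id, token):
    """grouped maps (year, month, day) -> list of PR metadata dicts."""
    api = HfApi(token=token)
    temp_dir = tempfile.mkdtemp()
    try:
        agent_dir = os.path.join(temp_dir, agent_identifier)
        os.makedirs(agent_dir, exist_ok=True)
        for (year, month, day), day_metadata in grouped.items():
            path = os.path.join(agent_dir, f"{year}.{month:02d}.{day:02d}.jsonl")
            save_jsonl(path, day_metadata)  # save_jsonl is already defined in app.py
        api.upload_large_folder(
            repo_id=repo_id,
            folder_path=temp_dir,
            repo_type="dataset",
        )
    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)
```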
|
| 1123 |
|
| 1124 |
|
| 1125 |
def load_pr_metadata():
|
| 1126 |
"""
|
| 1127 |
Loads PR metadata from the last LEADERBOARD_TIME_FRAME_DAYS only.
|
| 1128 |
-
In debug mode, loads from in-memory cache if available.
|
| 1129 |
|
| 1130 |
Structure: [agent_identifier]/YYYY.MM.DD.jsonl
|
| 1131 |
|
|
@@ -1133,30 +604,6 @@ def load_pr_metadata():
|
|
| 1133 |
List of dictionaries with 'agent_identifier' added to each PR metadata.
|
| 1134 |
Only includes PRs within the last LEADERBOARD_TIME_FRAME_DAYS.
|
| 1135 |
"""
|
| 1136 |
-
# In debug mode, check in-memory cache first
|
| 1137 |
-
if DEBUG_MODE and DEBUG_PR_METADATA_CACHE:
|
| 1138 |
-
all_metadata = []
|
| 1139 |
-
cutoff_date = datetime.now(timezone.utc) - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
|
| 1140 |
-
|
| 1141 |
-
for agent_identifier, metadata_list in DEBUG_PR_METADATA_CACHE.items():
|
| 1142 |
-
for pr_meta in metadata_list:
|
| 1143 |
-
# Filter by created_at date
|
| 1144 |
-
created_at = pr_meta.get('created_at')
|
| 1145 |
-
if created_at:
|
| 1146 |
-
try:
|
| 1147 |
-
dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
|
| 1148 |
-
if dt >= cutoff_date:
|
| 1149 |
-
pr_with_agent = pr_meta.copy()
|
| 1150 |
-
pr_with_agent['agent_identifier'] = agent_identifier
|
| 1151 |
-
all_metadata.append(pr_with_agent)
|
| 1152 |
-
except Exception:
|
| 1153 |
-
# If date parsing fails, skip this PR
|
| 1154 |
-
continue
|
| 1155 |
-
|
| 1156 |
-
if all_metadata:
|
| 1157 |
-
print(f"🐛 DEBUG MODE: Loading PR metadata from in-memory cache ({len(all_metadata)} PRs from last {LEADERBOARD_TIME_FRAME_DAYS} days)")
|
| 1158 |
-
return all_metadata
|
| 1159 |
-
|
| 1160 |
try:
|
| 1161 |
api = HfApi()
|
| 1162 |
token = get_hf_token()
|
|
@@ -1190,7 +637,8 @@ def load_pr_metadata():
|
|
| 1190 |
# If date parsing fails, skip this file
|
| 1191 |
continue
|
| 1192 |
|
| 1193 |
-
|
|
|
|
| 1194 |
|
| 1195 |
all_metadata = []
|
| 1196 |
for filename in relevant_files:
|
|
@@ -1232,79 +680,21 @@ def load_pr_metadata():
|
|
| 1232 |
except Exception as e:
|
| 1233 |
print(f" Warning: Could not load {filename}: {str(e)}")
|
| 1234 |
|
| 1235 |
-
print(f"✓ Loaded {len(all_metadata)} total PRs from last {
|
| 1236 |
return all_metadata
|
| 1237 |
|
| 1238 |
except Exception as e:
|
| 1239 |
-
|
|
|
|
| 1240 |
return []
|
| 1241 |
|
| 1242 |
|
| 1243 |
-
def
|
| 1244 |
-
"""
|
| 1245 |
-
Get the latest PR creation date for an agent from stored metadata.
|
| 1246 |
-
Used for incremental updates - only fetch PRs newer than this date.
|
| 1247 |
-
|
| 1248 |
-
Structure: [agent_identifier]/YYYY.MM.DD.jsonl
|
| 1249 |
-
|
| 1250 |
-
Args:
|
| 1251 |
-
agent_identifier: GitHub identifier of the agent
|
| 1252 |
-
|
| 1253 |
-
Returns:
|
| 1254 |
-
datetime or None if no existing PRs found.
|
| 1255 |
-
"""
|
| 1256 |
-
try:
|
| 1257 |
-
api = HfApi()
|
| 1258 |
-
token = get_hf_token()
|
| 1259 |
-
|
| 1260 |
-
# List all files in the repository
|
| 1261 |
-
files = api.list_repo_files(repo_id=PR_METADATA_REPO, repo_type="dataset")
|
| 1262 |
-
|
| 1263 |
-
# Filter for files in this agent's folder
|
| 1264 |
-
# New structure: [agent_identifier]/YYYY.MM.DD.jsonl
|
| 1265 |
-
agent_pattern = f"{agent_identifier}/"
|
| 1266 |
-
agent_files = [f for f in files if f.startswith(agent_pattern) and f.endswith('.jsonl')]
|
| 1267 |
-
|
| 1268 |
-
if not agent_files:
|
| 1269 |
-
return None
|
| 1270 |
-
|
| 1271 |
-
# Find latest created_at across all files
|
| 1272 |
-
latest_date = None
|
| 1273 |
-
for filename in agent_files:
|
| 1274 |
-
try:
|
| 1275 |
-
file_path = hf_hub_download(
|
| 1276 |
-
repo_id=PR_METADATA_REPO,
|
| 1277 |
-
filename=filename,
|
| 1278 |
-
repo_type="dataset",
|
| 1279 |
-
token=token
|
| 1280 |
-
)
|
| 1281 |
-
metadata = load_jsonl(file_path)
|
| 1282 |
-
|
| 1283 |
-
for pr in metadata:
|
| 1284 |
-
created_at = pr.get('created_at')
|
| 1285 |
-
if created_at:
|
| 1286 |
-
try:
|
| 1287 |
-
dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
|
| 1288 |
-
if latest_date is None or dt > latest_date:
|
| 1289 |
-
latest_date = dt
|
| 1290 |
-
except Exception:
|
| 1291 |
-
continue
|
| 1292 |
-
except Exception:
|
| 1293 |
-
continue
|
| 1294 |
-
|
| 1295 |
-
return latest_date
|
| 1296 |
-
|
| 1297 |
-
except Exception:
|
| 1298 |
-
return None
|
| 1299 |
-
|
| 1300 |
-
|
| 1301 |
-
def get_daily_files_last_n_months(agent_identifier, n_months=6):
|
| 1302 |
"""
|
| 1303 |
-
Get list of daily file paths for an agent from the
|
| 1304 |
|
| 1305 |
Args:
|
| 1306 |
agent_identifier: GitHub identifier of the agent
|
| 1307 |
-
n_months: Number of months to look back (default: 6)
|
| 1308 |
|
| 1309 |
Returns:
|
| 1310 |
List of file paths in format: [agent_identifier]/YYYY.MM.DD.jsonl
|
|
@@ -1313,9 +703,9 @@ def get_daily_files_last_n_months(agent_identifier, n_months=6):
|
|
| 1313 |
api = HfApi()
|
| 1314 |
token = get_hf_token()
|
| 1315 |
|
| 1316 |
-
# Calculate date range
|
| 1317 |
today = datetime.now(timezone.utc)
|
| 1318 |
-
|
| 1319 |
|
| 1320 |
# List all files in the repository
|
| 1321 |
files = api.list_repo_files(repo_id=PR_METADATA_REPO, repo_type="dataset")
|
|
@@ -1341,8 +731,8 @@ def get_daily_files_last_n_months(agent_identifier, n_months=6):
|
|
| 1341 |
file_year, file_month, file_day = map(int, date_components)
|
| 1342 |
file_date = datetime(file_year, file_month, file_day, tzinfo=timezone.utc)
|
| 1343 |
|
| 1344 |
-
# Include if within
|
| 1345 |
-
if
|
| 1346 |
recent_files.append(filename)
|
| 1347 |
except Exception:
|
| 1348 |
continue
|
|
@@ -1354,173 +744,6 @@ def get_daily_files_last_n_months(agent_identifier, n_months=6):
|
|
| 1354 |
return []
|
| 1355 |
|
| 1356 |
|
| 1357 |
-
|
| 1358 |
-
|
| 1359 |
-
def fetch_pr_current_status(pr_url, token, token_pool=None):
|
| 1360 |
-
"""
|
| 1361 |
-
Fetch the current status of a single PR from GitHub API.
|
| 1362 |
-
|
| 1363 |
-
Args:
|
| 1364 |
-
pr_url: PR HTML URL (e.g., https://github.com/owner/repo/pull/123)
|
| 1365 |
-
token: GitHub API token
|
| 1366 |
-
token_pool: Optional TokenPool for rate limit tracking
|
| 1367 |
-
|
| 1368 |
-
Returns:
|
| 1369 |
-
Dictionary with updated merged_at and closed_at, or None if failed
|
| 1370 |
-
"""
|
| 1371 |
-
try:
|
| 1372 |
-
# Convert HTML URL to API URL
|
| 1373 |
-
# https://github.com/owner/repo/pull/123 -> https://api.github.com/repos/owner/repo/pulls/123
|
| 1374 |
-
parts = pr_url.replace('https://github.com/', '').split('/')
|
| 1375 |
-
if len(parts) < 4:
|
| 1376 |
-
return None
|
| 1377 |
-
|
| 1378 |
-
owner, repo, pull_word, pr_number = parts[0], parts[1], parts[2], parts[3]
|
| 1379 |
-
api_url = f'https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}'
|
| 1380 |
-
|
| 1381 |
-
headers = {'Authorization': f'token {token}'} if token else {}
|
| 1382 |
-
response = request_with_backoff('GET', api_url, headers=headers, max_retries=3,
|
| 1383 |
-
token_pool=token_pool, token=token)
|
| 1384 |
-
|
| 1385 |
-
if response is None or response.status_code != 200:
|
| 1386 |
-
return None
|
| 1387 |
-
|
| 1388 |
-
pr_data = response.json()
|
| 1389 |
-
merged_at = pr_data.get('merged_at')
|
| 1390 |
-
closed_at = pr_data.get('closed_at')
|
| 1391 |
-
|
| 1392 |
-
# Only store closed_at if not merged
|
| 1393 |
-
if merged_at:
|
| 1394 |
-
closed_at = None
|
| 1395 |
-
|
| 1396 |
-
return {
|
| 1397 |
-
'merged_at': merged_at,
|
| 1398 |
-
'closed_at': closed_at
|
| 1399 |
-
}
|
| 1400 |
-
|
| 1401 |
-
except Exception as e:
|
| 1402 |
-
print(f" Error fetching PR status for {pr_url}: {str(e)}")
|
| 1403 |
-
return None
|
| 1404 |
-
|
| 1405 |
-
|
| 1406 |
-
def refresh_open_prs_for_agent(agent_identifier, token, token_pool=None):
|
| 1407 |
-
"""
|
| 1408 |
-
Refresh status for all open PRs from the last 6 months for an agent.
|
| 1409 |
-
Only updates PRs that are still open (no merged_at, no closed_at).
|
| 1410 |
-
|
| 1411 |
-
This implements the smart update strategy:
|
| 1412 |
-
- Skip PRs that are already closed/merged
|
| 1413 |
-
- Fetch current status for open PRs
|
| 1414 |
-
- Update and save back to daily files
|
| 1415 |
-
|
| 1416 |
-
Args:
|
| 1417 |
-
agent_identifier: GitHub identifier of the agent
|
| 1418 |
-
token: GitHub API token
|
| 1419 |
-
token_pool: Optional TokenPool for rate limit tracking
|
| 1420 |
-
|
| 1421 |
-
Returns:
|
| 1422 |
-
Tuple: (total_checked, updated_count)
|
| 1423 |
-
"""
|
| 1424 |
-
print(f"\n🔄 Refreshing open PRs for {agent_identifier} (last 6 months)...")
|
| 1425 |
-
|
| 1426 |
-
try:
|
| 1427 |
-
# Get daily files from last 6 months
|
| 1428 |
-
recent_files = get_daily_files_last_n_months(agent_identifier, n_months=6)
|
| 1429 |
-
|
| 1430 |
-
if not recent_files:
|
| 1431 |
-
print(f" No recent files found for {agent_identifier}")
|
| 1432 |
-
return (0, 0)
|
| 1433 |
-
|
| 1434 |
-
print(f" Found {len(recent_files)} daily files to check")
|
| 1435 |
-
|
| 1436 |
-
total_checked = 0
|
| 1437 |
-
updated_count = 0
|
| 1438 |
-
|
| 1439 |
-
# Process each file
|
| 1440 |
-
for filename in recent_files:
|
| 1441 |
-
try:
|
| 1442 |
-
# Download file
|
| 1443 |
-
file_path = hf_hub_download(
|
| 1444 |
-
repo_id=PR_METADATA_REPO,
|
| 1445 |
-
filename=filename,
|
| 1446 |
-
repo_type="dataset",
|
| 1447 |
-
token=get_hf_token()
|
| 1448 |
-
)
|
| 1449 |
-
prs = load_jsonl(file_path)
|
| 1450 |
-
|
| 1451 |
-
if not prs:
|
| 1452 |
-
continue
|
| 1453 |
-
|
| 1454 |
-
updated_prs = []
|
| 1455 |
-
file_had_updates = False
|
| 1456 |
-
|
| 1457 |
-
# Check each PR
|
| 1458 |
-
for pr in prs:
|
| 1459 |
-
# Skip if already closed or merged
|
| 1460 |
-
if pr.get('merged_at') or pr.get('closed_at'):
|
| 1461 |
-
updated_prs.append(pr)
|
| 1462 |
-
continue
|
| 1463 |
-
|
| 1464 |
-
# PR is open, fetch current status
|
| 1465 |
-
total_checked += 1
|
| 1466 |
-
pr_url = pr.get('html_url')
|
| 1467 |
-
|
| 1468 |
-
if not pr_url:
|
| 1469 |
-
updated_prs.append(pr)
|
| 1470 |
-
continue
|
| 1471 |
-
|
| 1472 |
-
current_status = fetch_pr_current_status(pr_url, token, token_pool)
|
| 1473 |
-
|
| 1474 |
-
if current_status:
|
| 1475 |
-
# Check if status changed
|
| 1476 |
-
if current_status['merged_at'] or current_status['closed_at']:
|
| 1477 |
-
print(f" ✓ PR status changed: {pr_url}")
|
| 1478 |
-
pr['merged_at'] = current_status['merged_at']
|
| 1479 |
-
pr['closed_at'] = current_status['closed_at']
|
| 1480 |
-
updated_count += 1
|
| 1481 |
-
file_had_updates = True
|
| 1482 |
-
|
| 1483 |
-
updated_prs.append(pr)
|
| 1484 |
-
time.sleep(0.1) # Rate limiting courtesy delay
|
| 1485 |
-
|
| 1486 |
-
# Save file if there were updates
|
| 1487 |
-
if file_had_updates:
|
| 1488 |
-
# Extract filename components for local save
|
| 1489 |
-
parts = filename.split('/')
|
| 1490 |
-
local_filename = parts[-1] # Just YYYY.MM.DD.jsonl
|
| 1491 |
-
|
| 1492 |
-
# Save locally
|
| 1493 |
-
save_jsonl(local_filename, updated_prs)
|
| 1494 |
-
|
| 1495 |
-
try:
|
| 1496 |
-
# Upload back to HuggingFace
|
| 1497 |
-
api = HfApi()
|
| 1498 |
-
upload_with_retry(
|
| 1499 |
-
api=api,
|
| 1500 |
-
path_or_fileobj=local_filename,
|
| 1501 |
-
path_in_repo=filename,
|
| 1502 |
-
repo_id=PR_METADATA_REPO,
|
| 1503 |
-
repo_type="dataset",
|
| 1504 |
-
token=get_hf_token()
|
| 1505 |
-
)
|
| 1506 |
-
print(f" 💾 Updated {filename}")
|
| 1507 |
-
finally:
|
| 1508 |
-
# Always clean up local file, even if upload fails
|
| 1509 |
-
if os.path.exists(local_filename):
|
| 1510 |
-
os.remove(local_filename)
|
| 1511 |
-
|
| 1512 |
-
except Exception as e:
|
| 1513 |
-
print(f" Warning: Could not process {filename}: {str(e)}")
|
| 1514 |
-
continue
|
| 1515 |
-
|
| 1516 |
-
print(f" ✅ Refresh complete: {total_checked} open PRs checked, {updated_count} updated")
|
| 1517 |
-
return (total_checked, updated_count)
|
| 1518 |
-
|
| 1519 |
-
except Exception as e:
|
| 1520 |
-
print(f" ✗ Error refreshing PRs for {agent_identifier}: {str(e)}")
|
| 1521 |
-
return (0, 0)
|
| 1522 |
-
|
| 1523 |
-
|
| 1524 |
# =============================================================================
|
| 1525 |
# HUGGINGFACE DATASET OPERATIONS
|
| 1526 |
# =============================================================================
|
|
@@ -1550,6 +773,11 @@ def load_agents_from_hf():
|
|
| 1550 |
|
| 1551 |
with open(file_path, 'r') as f:
|
| 1552 |
agent_data = json.load(f)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1553 |
agents.append(agent_data)
|
| 1554 |
|
| 1555 |
except Exception as e:
|
|
@@ -1564,8 +792,6 @@ def load_agents_from_hf():
|
|
| 1564 |
return None
|
| 1565 |
|
| 1566 |
|
| 1567 |
-
|
| 1568 |
-
|
| 1569 |
def get_hf_token():
|
| 1570 |
"""Get HuggingFace token from environment variables."""
|
| 1571 |
token = os.getenv('HF_TOKEN')
|
|
@@ -1655,111 +881,105 @@ def save_agent_to_hf(data):
|
|
| 1655 |
return False
|
| 1656 |
|
| 1657 |
|
| 1658 |
-
|
| 1659 |
-
|
| 1660 |
# =============================================================================
|
| 1661 |
# DATA MANAGEMENT
|
| 1662 |
# =============================================================================
|
| 1663 |
|
| 1664 |
-
def
|
| 1665 |
"""
|
| 1666 |
-
|
| 1667 |
-
|
| 1668 |
-
|
| 1669 |
-
|
| 1670 |
-
(to check if any have been merged or closed)
|
| 1671 |
-
2. Fetch new PRs created yesterday (from 12:00 AM to 11:59:59 PM yesterday)
|
| 1672 |
-
3. Update the corresponding daily files (YYYY.MM.DD.jsonl)
|
| 1673 |
-
4. This runs daily to keep data fresh without re-mining everything
|
| 1674 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1675 |
print(f"\n{'='*80}")
|
| 1676 |
-
print(f"
|
| 1677 |
-
print(f"{
|
|
|
|
|
|
|
| 1678 |
|
|
|
|
| 1679 |
try:
|
| 1680 |
-
|
| 1681 |
-
|
| 1682 |
-
|
| 1683 |
-
|
| 1684 |
-
token = token_pool.get_next_token()
|
| 1685 |
-
|
| 1686 |
-
# Load agent metadata from HuggingFace
|
| 1687 |
-
agents = load_agents_from_hf()
|
| 1688 |
-
if not agents:
|
| 1689 |
-
print("No agents found in HuggingFace dataset")
|
| 1690 |
-
return
|
| 1691 |
-
|
| 1692 |
-
# Calculate yesterday's date
|
| 1693 |
-
yesterday = (datetime.now(timezone.utc) - timedelta(days=1)).date()
|
| 1694 |
-
print(f"\n📅 Daily Incremental Update for {yesterday.strftime('%Y-%m-%d')} for all agents...")
|
| 1695 |
-
|
| 1696 |
-
agents_processed = 0
|
| 1697 |
-
total_refreshed = 0
|
| 1698 |
-
total_refreshed_updated = 0
|
| 1699 |
-
total_new_prs = 0
|
| 1700 |
-
|
| 1701 |
-
# Update each agent
|
| 1702 |
-
for agent in agents:
|
| 1703 |
-
identifier = agent.get('github_identifier')
|
| 1704 |
-
agent_name = agent.get('agent_name', 'Unknown')
|
| 1705 |
-
|
| 1706 |
-
if not identifier:
|
| 1707 |
-
print(f"Warning: Skipping agent without identifier: {agent}")
|
| 1708 |
-
continue
|
| 1709 |
|
| 1710 |
-
|
| 1711 |
-
|
| 1712 |
-
|
| 1713 |
-
|
| 1714 |
-
|
| 1715 |
-
# STEP 1: Refresh all open PRs from the last LEADERBOARD_TIME_FRAME_DAYS - 1 days
|
| 1716 |
-
print(f"\n🔄 Step 1: Refreshing open PRs (last {LEADERBOARD_TIME_FRAME_DAYS - 1} days)...")
|
| 1717 |
-
refreshed_checked, refreshed_updated = refresh_open_prs_for_agent(
|
| 1718 |
-
identifier,
|
| 1719 |
-
token,
|
| 1720 |
-
token_pool
|
| 1721 |
-
)
|
| 1722 |
-
total_refreshed += refreshed_checked
|
| 1723 |
-
total_refreshed_updated += refreshed_updated
|
| 1724 |
-
|
| 1725 |
-
# STEP 2: Fetch new PRs created yesterday (12:00 AM to 11:59:59 PM yesterday)
|
| 1726 |
-
print(f"\n📥 Step 2: Fetching new PRs created on {yesterday.strftime('%Y-%m-%d')} (12:00 AM to 11:59:59 PM)...")
|
| 1727 |
-
new_metadata = fetch_daily_prs_metadata(
|
| 1728 |
-
identifier,
|
| 1729 |
-
agent_name,
|
| 1730 |
-
token_pool,
|
| 1731 |
-
target_date=yesterday
|
| 1732 |
-
)
|
| 1733 |
|
| 1734 |
-
|
| 1735 |
-
|
| 1736 |
-
|
| 1737 |
-
|
| 1738 |
-
|
| 1739 |
-
|
| 1740 |
-
|
|
|
|
|
|
|
| 1741 |
|
| 1742 |
-
|
|
|
|
|
|
|
|
|
|
| 1743 |
|
| 1744 |
-
|
| 1745 |
-
|
| 1746 |
-
|
| 1747 |
-
|
| 1748 |
-
|
|
|
|
|
|
|
| 1749 |
|
| 1750 |
-
|
| 1751 |
-
|
| 1752 |
-
|
| 1753 |
-
|
| 1754 |
-
print(f" New PRs added (from yesterday): {total_new_prs}")
|
| 1755 |
-
print(f"{'='*80}")
|
| 1756 |
|
| 1757 |
-
|
| 1758 |
|
| 1759 |
-
|
| 1760 |
-
|
| 1761 |
-
|
| 1762 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1763 |
|
| 1764 |
|
| 1765 |
def construct_leaderboard_from_metadata():
|
|
@@ -1805,15 +1025,26 @@ def construct_leaderboard_from_metadata():
|
|
| 1805 |
# UI FUNCTIONS
|
| 1806 |
# =============================================================================
|
| 1807 |
|
| 1808 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1809 |
"""
|
| 1810 |
Create a Plotly figure with dual y-axes showing:
|
| 1811 |
- Left y-axis: Acceptance rate (%) as line curves
|
| 1812 |
- Right y-axis: Total PRs created as bar charts
|
| 1813 |
|
| 1814 |
Each agent gets a unique color for both their line and bars.
|
|
|
|
|
|
|
|
|
|
| 1815 |
"""
|
| 1816 |
-
metrics = calculate_monthly_metrics_by_agent()
|
| 1817 |
|
| 1818 |
if not metrics['agents'] or not metrics['months']:
|
| 1819 |
# Return an empty figure with a message
|
|
@@ -1834,19 +1065,16 @@ def create_monthly_metrics_plot():
     # Create figure with secondary y-axis
     fig = make_subplots(specs=[[{"secondary_y": True}]])
 
-    # Define colors for agents (using a color palette)
-    colors = [
-        '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
-        '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'
-    ]
-
     agents = metrics['agents']
     months = metrics['months']
     data = metrics['data']
…
     # Add traces for each agent
     for idx, agent_name in enumerate(agents):
-        color = …
         agent_data = data[agent_name]
 
         # Add line trace for acceptance rate (left y-axis)
|
|
@@ -1966,13 +1194,11 @@ def get_leaderboard_dataframe():
|
|
| 1966 |
return df
|
| 1967 |
|
| 1968 |
|
| 1969 |
-
|
| 1970 |
-
|
| 1971 |
def submit_agent(identifier, agent_name, organization, description, website):
|
| 1972 |
"""
|
| 1973 |
Submit a new agent to the leaderboard.
|
| 1974 |
Validates input and saves submission.
|
| 1975 |
-
PR data will be populated by the
|
| 1976 |
"""
|
| 1977 |
# Validate required fields
|
| 1978 |
if not identifier or not identifier.strip():
|
|
@@ -2016,7 +1242,7 @@ def submit_agent(identifier, agent_name, organization, description, website):
|
|
| 2016 |
if not save_agent_to_hf(submission):
|
| 2017 |
return "❌ Failed to save submission", get_leaderboard_dataframe(), create_monthly_metrics_plot()
|
| 2018 |
|
| 2019 |
-
success_msg = f"✅ Successfully submitted {agent_name}!\n\nPR data will be populated by the
|
| 2020 |
return success_msg, get_leaderboard_dataframe(), create_monthly_metrics_plot()
|
| 2021 |
|
| 2022 |
|
|
@@ -2024,73 +1250,65 @@ def submit_agent(identifier, agent_name, organization, description, website):
|
|
| 2024 |
# GRADIO APPLICATION
|
| 2025 |
# =============================================================================
|
| 2026 |
|
| 2027 |
-
|
| 2028 |
-
|
| 2029 |
-
|
| 2030 |
-
print("🐛 DEBUG MODE ENABLED 🐛")
|
| 2031 |
-
print("="*80)
|
| 2032 |
-
print("PR retrieval is limited to 10 PRs per query pattern per agent")
|
| 2033 |
-
|
| 2034 |
-
# Show how debug mode was enabled
|
| 2035 |
-
if args.debug:
|
| 2036 |
-
print("Enabled via: command-line flag '--debug'")
|
| 2037 |
-
print("To disable: run without '--debug' flag")
|
| 2038 |
-
else:
|
| 2039 |
-
print("Enabled via: DEBUG_MODE environment variable")
|
| 2040 |
-
print("To disable: run with '--no-debug' flag or unset DEBUG_MODE")
|
| 2041 |
|
| 2042 |
-
|
| 2043 |
-
else:
|
| 2044 |
-
print("\n🚀 Starting in PRODUCTION MODE - full PR retrieval enabled")
|
| 2045 |
-
if args.no_debug:
|
| 2046 |
-
print(" (Explicitly set via '--no-debug' flag)")
|
| 2047 |
-
print()
|
| 2048 |
-
|
| 2049 |
-
# Start APScheduler for daily incremental PR mining at 12:00 AM UTC
|
| 2050 |
scheduler = BackgroundScheduler(timezone="UTC")
|
| 2051 |
scheduler.add_job(
|
| 2052 |
-
|
| 2053 |
-
trigger=CronTrigger(hour=0, minute=0), # 12:00 AM UTC
|
| 2054 |
-
id='
|
| 2055 |
-
name='
|
| 2056 |
replace_existing=True
|
| 2057 |
)
|
| 2058 |
scheduler.start()
|
| 2059 |
-
print("✓ Scheduler started:
|
| 2060 |
|
| 2061 |
# Create Gradio interface
|
| 2062 |
with gr.Blocks(title="SWE Agent PR Leaderboard", theme=gr.themes.Soft()) as app:
|
| 2063 |
-
|
|
|
|
| 2064 |
gr.Markdown("# 🏆 SWE Agent PR Leaderboard")
|
| 2065 |
-
gr.Markdown("Track and compare GitHub pull request statistics for SWE agents (last
|
| 2066 |
-
|
| 2067 |
with gr.Tabs():
|
| 2068 |
-
|
| 2069 |
# Leaderboard Tab
|
| 2070 |
with gr.Tab("📊 Leaderboard"):
|
| 2071 |
-
gr.Markdown("*All statistics are based on PRs from the last
|
| 2072 |
|
| 2073 |
leaderboard_table = Leaderboard(
|
| 2074 |
value=get_leaderboard_dataframe(),
|
| 2075 |
datatype=LEADERBOARD_COLUMNS,
|
| 2076 |
search_columns=["Agent Name", "Website"],
|
| 2077 |
-
filter_columns=[
|
|
|
| 2078 |
)
|
| 2079 |
|
| 2080 |
-
gr.Markdown("### Monthly Metrics")
|
| 2081 |
-
gr.Markdown("Track acceptance rates and PR activity over time")
|
| 2082 |
|
| 2083 |
monthly_plot = gr.Plot(
|
| 2084 |
-
value=create_monthly_metrics_plot(),
|
| 2085 |
label="Monthly PR Metrics"
|
| 2086 |
)
|
| 2087 |
-
|
| 2088 |
# Submit Agent Tab
|
| 2089 |
with gr.Tab("➕ Submit Agent"):
|
| 2090 |
-
|
| 2091 |
gr.Markdown("### Submit Your Agent")
|
| 2092 |
-
gr.Markdown("Fill in the details below to add your agent to the leaderboard.
|
| 2093 |
-
|
| 2094 |
with gr.Row():
|
| 2095 |
with gr.Column():
|
| 2096 |
github_input = gr.Textbox(
|
|
@@ -2101,7 +1319,7 @@ with gr.Blocks(title="SWE Agent PR Leaderboard", theme=gr.themes.Soft()) as app:
|
|
| 2101 |
label="Agent Name*",
|
| 2102 |
placeholder="Your agent's display name"
|
| 2103 |
)
|
| 2104 |
-
|
| 2105 |
with gr.Column():
|
| 2106 |
organization_input = gr.Textbox(
|
| 2107 |
label="Organization*",
|
|
@@ -2113,10 +1331,10 @@ with gr.Blocks(title="SWE Agent PR Leaderboard", theme=gr.themes.Soft()) as app:
|
|
| 2113 |
lines=3
|
| 2114 |
)
|
| 2115 |
website_input = gr.Textbox(
|
| 2116 |
-
label="Website",
|
| 2117 |
placeholder="https://your-agent-website.com"
|
| 2118 |
)
|
| 2119 |
-
|
| 2120 |
submit_button = gr.Button(
|
| 2121 |
"Submit Agent",
|
| 2122 |
variant="primary"
|
|
@@ -2125,7 +1343,7 @@ with gr.Blocks(title="SWE Agent PR Leaderboard", theme=gr.themes.Soft()) as app:
|
|
| 2125 |
label="Submission Status",
|
| 2126 |
interactive=False
|
| 2127 |
)
|
| 2128 |
-
|
| 2129 |
# Event handler
|
| 2130 |
submit_button.click(
|
| 2131 |
fn=submit_agent,
|
|
@@ -2136,4 +1354,4 @@ with gr.Blocks(title="SWE Agent PR Leaderboard", theme=gr.themes.Soft()) as app:
|
|
| 2136 |
|
| 2137 |
# Launch application
|
| 2138 |
if __name__ == "__main__":
|
| 2139 |
-
app.launch()
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
+
from gradio_leaderboard import Leaderboard, ColumnFilter
|
| 3 |
import json
|
| 4 |
import os
|
| 5 |
import time
|
| 6 |
+
import tempfile
|
| 7 |
import requests
|
| 8 |
from datetime import datetime, timezone, timedelta
|
| 9 |
from collections import defaultdict
|
| 10 |
from huggingface_hub import HfApi, hf_hub_download
|
|
|
|
|
|
|
| 11 |
from dotenv import load_dotenv
|
| 12 |
import pandas as pd
|
| 13 |
import random
|
|
|
|
| 14 |
import plotly.graph_objects as go
|
| 15 |
from plotly.subplots import make_subplots
|
| 16 |
from apscheduler.schedulers.background import BackgroundScheduler
|
| 17 |
from apscheduler.triggers.cron import CronTrigger
|
| 18 |
+
from google.cloud import bigquery
|
| 19 |
|
| 20 |
# Load environment variables
|
| 21 |
load_dotenv()
|
| 22 |
|
|
|
| 23 |
# =============================================================================
|
| 24 |
# CONFIGURATION
|
| 25 |
# =============================================================================
|
| 26 |
|
|
|
| 27 |
AGENTS_REPO = "SWE-Arena/swe_agents" # HuggingFace dataset for agent metadata
|
| 28 |
PR_METADATA_REPO = "SWE-Arena/pr_metadata" # HuggingFace dataset for PR metadata
|
| 29 |
+
LEADERBOARD_TIME_FRAME_DAYS = 180 # Time frame for constructing leaderboard
|
| 30 |
+
UPDATE_TIME_FRAME_DAYS = 30 # Time frame for mining new PRs
|
| 31 |
|
| 32 |
LEADERBOARD_COLUMNS = [
|
| 33 |
("Agent Name", "string"),
|
|
|
|
| 45 |
"""Load JSONL file and return list of dictionaries."""
|
| 46 |
if not os.path.exists(filename):
|
| 47 |
return []
|
| 48 |
+
|
| 49 |
data = []
|
| 50 |
with open(filename, 'r', encoding='utf-8') as f:
|
| 51 |
for line in f:
|
|
|
|
| 66 |
f.write(json.dumps(item) + '\n')
|
| 67 |
|
| 68 |
|
|
|
| 69 |
# =============================================================================
|
| 70 |
+
# BIGQUERY FUNCTIONS
|
| 71 |
# =============================================================================
|
| 72 |
|
| 73 |
+
def get_bigquery_client():
|
| 74 |
"""
|
| 75 |
+
Initialize BigQuery client using credentials from environment variable.
|
|
|
|
| 76 |
|
| 77 |
+
Expects GOOGLE_APPLICATION_CREDENTIALS_JSON environment variable containing
|
| 78 |
+
the service account JSON credentials as a string.
|
|
|
|
|
|
|
|
|
|
| 79 |
"""
|
| 80 |
+
# Get the JSON content from environment variable
|
| 81 |
+
creds_json = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS_JSON')
|
|
|
| 82 |
|
| 83 |
+
if creds_json:
|
| 84 |
+
# Create a temporary file to store credentials
|
| 85 |
+
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as temp_file:
|
| 86 |
+
temp_file.write(creds_json)
|
| 87 |
+
temp_path = temp_file.name
|
| 88 |
|
| 89 |
+
# Set environment variable to point to temp file
|
| 90 |
+
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = temp_path
|
|
|
|
| 91 |
|
| 92 |
+
# Initialize BigQuery client
|
| 93 |
+
client = bigquery.Client()
|
|
|
|
|
|
|
| 94 |
|
| 95 |
+
# Clean up temp file
|
| 96 |
+
os.unlink(temp_path)
|
|
|
| 97 |
|
| 98 |
+
return client
|
|
|
| 99 |
else:
|
| 100 |
+
raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON not found in environment")
|
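Note: a minimal smoke test for the credential flow above, assuming the `google-cloud-bigquery` package is installed; the key-file path is a placeholder and the dry-run query is purely illustrative, not part of app.py.

```python
# Hypothetical local check, not part of app.py. Instead of the JSON-in-env flow
# above, it points GOOGLE_APPLICATION_CREDENTIALS at a key file on disk and
# dry-runs a trivial query so no bytes are billed.
import os
from google.cloud import bigquery

os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", "/path/to/service-account.json")

client = bigquery.Client()
job_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
job = client.query("SELECT 1", job_config=job_config)
print(f"Dry run OK, estimated bytes processed: {job.total_bytes_processed}")
```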
|
|
| 101 |
|
| 102 |
|
| 103 |
+
def generate_table_union_statements(start_date, end_date):
|
| 104 |
"""
|
| 105 |
+
Generate UNION ALL statements for githubarchive.day tables in date range.
|
|
|
|
|
|
|
| 106 |
|
| 107 |
Args:
|
| 108 |
+
start_date: Start datetime
|
| 109 |
+
end_date: End datetime
|
|
|
|
| 110 |
|
| 111 |
+
Returns:
|
| 112 |
+
String with UNION ALL SELECT statements for all tables in range
|
| 113 |
"""
|
| 114 |
+
table_names = []
|
| 115 |
+
current_date = start_date
|
|
|
|
| 116 |
|
| 117 |
+
while current_date < end_date:
|
| 118 |
+
table_name = f"`githubarchive.day.{current_date.strftime('%Y%m%d')}`"
|
| 119 |
+
table_names.append(table_name)
|
| 120 |
+
current_date += timedelta(days=1)
|
| 121 |
|
| 122 |
+
# Create UNION ALL chain
|
| 123 |
+
union_parts = [f"SELECT * FROM {table}" for table in table_names]
|
| 124 |
+
return " UNION ALL ".join(union_parts)
|
| 125 |
|
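For illustration, a standalone sketch (mirroring the loop above rather than importing app.py) of the UNION ALL chain produced for a three-day window; the dates are arbitrary.

```python
# Standalone sketch of the table UNION produced by generate_table_union_statements.
from datetime import datetime, timedelta, timezone

start = datetime(2024, 1, 1, tzinfo=timezone.utc)
end = start + timedelta(days=3)

tables = []
current = start
while current < end:
    tables.append(f"`githubarchive.day.{current.strftime('%Y%m%d')}`")
    current += timedelta(days=1)

print(" UNION ALL ".join(f"SELECT * FROM {t}" for t in tables))
# SELECT * FROM `githubarchive.day.20240101` UNION ALL SELECT * FROM
# `githubarchive.day.20240102` UNION ALL SELECT * FROM `githubarchive.day.20240103`
```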
| 126 |
|
| 127 |
+
def fetch_all_pr_metadata_single_query(client, identifiers, start_date, end_date):
|
| 128 |
"""
|
| 129 |
+
Fetch PR metadata for ALL agents using ONE comprehensive BigQuery query.
|
|
|
| 130 |
|
| 131 |
+
This query fetches:
|
| 132 |
+
1. PRs authored by agents (user.login matches identifier)
|
| 133 |
+
2. PRs with co-authored-by (search in body for co-authored-by)
|
| 134 |
+
3. PRs from branches starting with agent identifier (head.ref pattern)
|
| 135 |
|
| 136 |
Args:
|
| 137 |
+
client: BigQuery client instance
|
| 138 |
+
identifiers: List of GitHub usernames/bot identifiers
|
| 139 |
+
start_date: Start datetime (timezone-aware)
|
| 140 |
+
end_date: End datetime (timezone-aware)
|
|
|
|
| 141 |
|
| 142 |
Returns:
|
| 143 |
+
Dictionary mapping agent identifier to list of PR metadata
|
| 144 |
"""
|
| 145 |
+
print(f"\n🔍 Querying BigQuery for ALL {len(identifiers)} agents in ONE QUERY")
|
| 146 |
+
print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
|
| 147 |
+
|
| 148 |
+
# Generate table UNION statements for the time range
|
| 149 |
+
table_union = generate_table_union_statements(start_date, end_date)
|
| 150 |
+
|
| 151 |
+
# Build identifier lists for SQL IN clauses
|
| 152 |
+
# For author matching, include identifiers with [bot]
|
| 153 |
+
author_list = ', '.join([f"'{id}'" for id in identifiers if '[bot]' in id])
|
| 154 |
+
|
| 155 |
+
# For branch matching and co-author, use stripped identifiers (without [bot])
|
| 156 |
+
stripped_identifiers = [id.replace('[bot]', '') for id in identifiers]
|
| 157 |
+
|
| 158 |
+
# Build co-author pattern (search in body)
|
| 159 |
+
coauthor_patterns = ' OR '.join([f"LOWER(JSON_EXTRACT_SCALAR(payload, '$.pull_request.body')) LIKE '%co-authored-by: {id.lower()}%'"
|
| 160 |
+
for id in stripped_identifiers if id])
|
| 161 |
+
|
| 162 |
+
# Build branch pattern
|
| 163 |
+
branch_patterns = ' OR '.join([f"JSON_EXTRACT_SCALAR(payload, '$.pull_request.head.ref') LIKE '{id}/%'"
|
| 164 |
+
for id in stripped_identifiers if id])
|
| 165 |
+
|
| 166 |
+
# Build comprehensive query with CTE
|
| 167 |
+
query = f"""
|
| 168 |
+
WITH pr_events AS (
|
| 169 |
+
-- Get all PR events (opened, closed) for all agents
|
| 170 |
+
SELECT
|
| 171 |
+
JSON_EXTRACT_SCALAR(payload, '$.pull_request.html_url') as html_url,
|
| 172 |
+
JSON_EXTRACT_SCALAR(payload, '$.pull_request.user.login') as pr_author,
|
| 173 |
+
JSON_EXTRACT_SCALAR(payload, '$.pull_request.head.ref') as branch_name,
|
| 174 |
+
JSON_EXTRACT_SCALAR(payload, '$.pull_request.body') as pr_body,
|
| 175 |
+
JSON_EXTRACT_SCALAR(payload, '$.pull_request.created_at') as created_at,
|
| 176 |
+
CAST(JSON_EXTRACT_SCALAR(payload, '$.pull_request.merged') AS BOOL) as is_merged,
|
| 177 |
+
JSON_EXTRACT_SCALAR(payload, '$.pull_request.merged_at') as merged_at,
|
| 178 |
+
JSON_EXTRACT_SCALAR(payload, '$.pull_request.closed_at') as closed_at,
|
| 179 |
+
JSON_EXTRACT_SCALAR(payload, '$.action') as action,
|
| 180 |
+
created_at as event_time
|
| 181 |
+
FROM (
|
| 182 |
+
{table_union}
|
| 183 |
+
)
|
| 184 |
+
WHERE
|
| 185 |
+
type = 'PullRequestEvent'
|
| 186 |
+
AND JSON_EXTRACT_SCALAR(payload, '$.pull_request.html_url') IS NOT NULL
|
| 187 |
+
AND (
|
| 188 |
+
-- Match PRs authored by agents with [bot] suffix
|
| 189 |
+
{f"JSON_EXTRACT_SCALAR(payload, '$.pull_request.user.login') IN ({author_list})" if author_list else "FALSE"}
|
| 190 |
+
{" OR " if author_list and (coauthor_patterns or branch_patterns) else ""}
|
| 191 |
+
-- Match PRs with co-authored-by in body
|
| 192 |
+
{f"({coauthor_patterns})" if coauthor_patterns else ""}
|
| 193 |
+
{" OR " if coauthor_patterns and branch_patterns else ""}
|
| 194 |
+
-- Match PRs with branch names starting with agent identifier
|
| 195 |
+
{f"({branch_patterns})" if branch_patterns else ""}
|
| 196 |
+
)
|
| 197 |
+
),
|
| 198 |
+
|
| 199 |
+
pr_latest_state AS (
|
| 200 |
+
-- Get the latest state for each PR (most recent event)
|
| 201 |
+
SELECT
|
| 202 |
+
html_url,
|
| 203 |
+
pr_author,
|
| 204 |
+
branch_name,
|
| 205 |
+
pr_body,
|
| 206 |
+
created_at,
|
| 207 |
+
merged_at,
|
| 208 |
+
closed_at,
|
| 209 |
+
ROW_NUMBER() OVER (PARTITION BY html_url ORDER BY event_time DESC) as row_num
|
| 210 |
+
FROM pr_events
|
| 211 |
+
)
|
| 212 |
|
| 213 |
+
-- Return deduplicated PR metadata
|
| 214 |
+
SELECT DISTINCT
|
| 215 |
+
html_url,
|
| 216 |
+
pr_author,
|
| 217 |
+
branch_name,
|
| 218 |
+
pr_body,
|
| 219 |
+
created_at,
|
| 220 |
+
merged_at,
|
| 221 |
+
-- Only include closed_at if PR is closed but not merged
|
| 222 |
+
CASE
|
| 223 |
+
WHEN merged_at IS NOT NULL THEN NULL
|
| 224 |
+
ELSE closed_at
|
| 225 |
+
END as closed_at
|
| 226 |
+
FROM pr_latest_state
|
| 227 |
+
WHERE row_num = 1
|
| 228 |
+
ORDER BY created_at DESC
|
| 229 |
+
"""
|
| 230 |
|
| 231 |
+
print(f" Querying {(end_date - start_date).days} days of GitHub Archive data...")
|
| 232 |
+
print(f" Agents: {', '.join(identifiers[:5])}{'...' if len(identifiers) > 5 else ''}")
|
| 233 |
|
| 234 |
+
try:
|
| 235 |
+
query_job = client.query(query)
|
| 236 |
+
results = list(query_job.result())
|
| 237 |
|
| 238 |
+
print(f" ✓ Found {len(results)} total PRs across all agents")
|
| 239 |
|
| 240 |
+
# Group results by agent
|
| 241 |
+
metadata_by_agent = defaultdict(list)
|
|
| 242 |
|
| 243 |
+
for row in results:
|
| 244 |
+
# Convert datetime objects to ISO strings
|
| 245 |
+
created_at = row.created_at
|
| 246 |
+
if hasattr(created_at, 'isoformat'):
|
| 247 |
+
created_at = created_at.isoformat()
|
|
|
| 248 |
|
| 249 |
+
merged_at = row.merged_at
|
| 250 |
+
if hasattr(merged_at, 'isoformat'):
|
| 251 |
+
merged_at = merged_at.isoformat()
|
| 252 |
|
| 253 |
+
closed_at = row.closed_at
|
| 254 |
+
if hasattr(closed_at, 'isoformat'):
|
| 255 |
+
closed_at = closed_at.isoformat()
|
| 256 |
|
| 257 |
+
pr_data = {
|
| 258 |
+
'html_url': row.html_url,
|
| 259 |
+
'created_at': created_at,
|
| 260 |
+
'merged_at': merged_at,
|
| 261 |
+
'closed_at': closed_at,
|
| 262 |
+
}
|
| 263 |
|
| 264 |
+
# Assign to agent based on author, co-author, or branch pattern
|
| 265 |
+
pr_author = row.pr_author
|
| 266 |
+
branch_name = row.branch_name or ''
|
| 267 |
+
pr_body = (row.pr_body or '').lower()
|
|
|
|
| 268 |
|
| 269 |
+
# First, try to match by author
|
| 270 |
+
if pr_author and pr_author in identifiers:
|
| 271 |
+
metadata_by_agent[pr_author].append(pr_data)
|
| 272 |
+
else:
|
| 273 |
+
# Try to match by co-author or branch pattern
|
| 274 |
+
for identifier in identifiers:
|
| 275 |
+
stripped_id = identifier.replace('[bot]', '')
|
| 276 |
+
if not stripped_id:
|
| 277 |
+
continue
|
|
|
| 278 |
|
| 279 |
+
# Check co-author
|
| 280 |
+
if f'co-authored-by: {stripped_id.lower()}' in pr_body:
|
| 281 |
+
metadata_by_agent[identifier].append(pr_data)
|
| 282 |
+
break
|
| 283 |
+
|
| 284 |
+
# Check branch pattern
|
| 285 |
+
if branch_name.startswith(f"{stripped_id}/"):
|
| 286 |
+
metadata_by_agent[identifier].append(pr_data)
|
| 287 |
+
break
|
| 288 |
+
|
| 289 |
+
# Print breakdown by agent
|
| 290 |
+
print(f"\n 📊 Results breakdown by agent:")
|
| 291 |
+
for identifier in identifiers:
|
| 292 |
+
count = len(metadata_by_agent.get(identifier, []))
|
| 293 |
+
if count > 0:
|
| 294 |
+
metadata = metadata_by_agent[identifier]
|
| 295 |
+
merged_count = sum(1 for m in metadata if m['merged_at'] is not None)
|
| 296 |
+
closed_count = sum(1 for m in metadata if m['closed_at'] is not None and m['merged_at'] is None)
|
| 297 |
+
open_count = count - merged_count - closed_count
|
| 298 |
+
print(f" {identifier}: {count} PRs ({merged_count} merged, {closed_count} closed, {open_count} open)")
|
| 299 |
+
|
| 300 |
+
# Convert defaultdict to regular dict
|
| 301 |
+
return dict(metadata_by_agent)
|
| 302 |
|
| 303 |
+
except Exception as e:
|
| 304 |
+
print(f" ✗ BigQuery error: {str(e)}")
|
| 305 |
+
import traceback
|
| 306 |
+
traceback.print_exc()
|
| 307 |
+
return {}
|
| 308 |
|
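The grouping logic above applies three matching rules in order: direct author, a co-authored-by trailer in the body, then a branch prefixed with the identifier. A toy sketch with invented identifiers and rows:

```python
# Toy sketch of the attribution order used above; identifiers and rows are invented.
identifiers = ["example-agent[bot]", "other-agent[bot]"]

rows = [
    {"pr_author": "example-agent[bot]", "branch_name": "", "pr_body": ""},
    {"pr_author": "human-dev", "branch_name": "", "pr_body": "Co-authored-by: other-agent <bot@example.com>"},
    {"pr_author": "human-dev", "branch_name": "example-agent/fix-123", "pr_body": ""},
]

def attribute(row):
    # 1) direct author match
    if row["pr_author"] in identifiers:
        return row["pr_author"]
    body = row["pr_body"].lower()
    for identifier in identifiers:
        stripped = identifier.replace("[bot]", "")
        # 2) co-authored-by trailer in the PR body
        if f"co-authored-by: {stripped.lower()}" in body:
            return identifier
        # 3) branch named <identifier>/...
        if row["branch_name"].startswith(f"{stripped}/"):
            return identifier
    return None

print([attribute(r) for r in rows])
# ['example-agent[bot]', 'other-agent[bot]', 'example-agent[bot]']
```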
|
|
|
|
|
|
| 309 |
|
| 310 |
+
# =============================================================================
|
| 311 |
+
# GITHUB API OPERATIONS (Minimal - Only for Validation)
|
| 312 |
+
# =============================================================================
|
| 313 |
|
| 314 |
+
def get_github_token():
|
| 315 |
+
"""Get first GitHub token from environment variables."""
|
| 316 |
+
token = os.getenv('GITHUB_TOKEN')
|
| 317 |
+
if not token:
|
| 318 |
+
print("Warning: GITHUB_TOKEN not found. Validation will be limited.")
|
| 319 |
+
return token
|
| 320 |
|
|
|
|
| 321 |
|
| 322 |
+
def validate_github_username(identifier):
|
| 323 |
+
"""Verify that a GitHub identifier exists (simple validation)."""
|
| 324 |
+
try:
|
| 325 |
+
token = get_github_token()
|
| 326 |
+
headers = {'Authorization': f'token {token}'} if token else {}
|
| 327 |
+
url = f'https://api.github.com/users/{identifier}'
|
| 328 |
|
| 329 |
+
response = requests.get(url, headers=headers, timeout=10)
|
| 330 |
+
|
| 331 |
+
if response.status_code == 200:
|
| 332 |
+
return True, "Username is valid"
|
| 333 |
+
elif response.status_code == 404:
|
| 334 |
+
return False, "GitHub identifier not found"
|
| 335 |
+
else:
|
| 336 |
+
return False, f"Validation error: HTTP {response.status_code}"
|
| 337 |
+
except Exception as e:
|
| 338 |
+
return False, f"Validation error: {str(e)}"
|
| 339 |
|
| 340 |
|
| 341 |
+
# =============================================================================
|
| 342 |
+
# PR STATISTICS
|
| 343 |
+
# =============================================================================
|
| 344 |
|
| 345 |
def calculate_pr_stats_from_metadata(metadata_list):
|
| 346 |
"""
|
| 347 |
Calculate statistics from a list of PR metadata (lightweight objects).
|
| 348 |
+
Works with minimal metadata: html_url, created_at, merged_at, closed_at.
|
| 349 |
|
| 350 |
Returns a dictionary with comprehensive PR metrics.
|
| 351 |
|
|
|
|
| 374 |
}
|
| 375 |
|
| 376 |
|
| 377 |
+
def calculate_monthly_metrics_by_agent(top_n=None):
|
| 378 |
"""
|
| 379 |
+
Calculate monthly metrics for all agents (or top N agents) for visualization.
|
| 380 |
Loads data directly from SWE-Arena/pr_metadata dataset.
|
| 381 |
|
| 382 |
+
Args:
|
| 383 |
+
top_n: If specified, only return metrics for the top N agents by total PRs.
|
| 384 |
+
Agents are ranked by their total PR count across all months.
|
| 385 |
+
|
| 386 |
Returns:
|
| 387 |
dict: {
|
| 388 |
'agents': list of agent names,
|
|
|
|
| 447 |
for month in months:
|
| 448 |
prs_in_month = month_dict.get(month, [])
|
| 449 |
|
| 450 |
+
# Count merged PRs
|
|
|
|
| 451 |
merged_count = sum(1 for pr in prs_in_month if pr.get('merged_at'))
|
| 452 |
|
| 453 |
# Count closed but not merged
|
|
|
|
| 473 |
'closed_not_merged': closed_not_merged_list
|
| 474 |
}
|
| 475 |
|
| 476 |
+
# Filter to top N agents if specified
|
| 477 |
+
agents_list = sorted(list(agent_month_data.keys()))
|
| 478 |
+
if top_n is not None and top_n > 0:
|
| 479 |
+
# Calculate total PRs for each agent across all months
|
| 480 |
+
agent_totals = []
|
| 481 |
+
for agent_name in agents_list:
|
| 482 |
+
total_pr_count = sum(result_data[agent_name]['total_prs'])
|
| 483 |
+
agent_totals.append((agent_name, total_pr_count))
|
| 484 |
+
|
| 485 |
+
# Sort by total PRs (descending) and take top N
|
| 486 |
+
agent_totals.sort(key=lambda x: x[1], reverse=True)
|
| 487 |
+
top_agents = [agent_name for agent_name, _ in agent_totals[:top_n]]
|
| 488 |
+
|
| 489 |
+
# Filter result_data to only include top agents
|
| 490 |
+
result_data = {agent: result_data[agent] for agent in top_agents if agent in result_data}
|
| 491 |
+
agents_list = top_agents
|
| 492 |
+
|
| 493 |
return {
|
| 494 |
+
'agents': agents_list,
|
| 495 |
'months': months,
|
| 496 |
'data': result_data
|
| 497 |
}
|
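A toy illustration of the top_n filter: agents are ranked by PRs summed across months and only the leaders are kept; the counts below are invented.

```python
# Invented monthly totals; keep the top 2 agents by overall PR count.
result_data = {
    "agent-a": {"total_prs": [4, 6, 5]},
    "agent-b": {"total_prs": [1, 0, 2]},
    "agent-c": {"total_prs": [9, 8, 7]},
}

top_n = 2
totals = sorted(
    ((name, sum(d["total_prs"])) for name, d in result_data.items()),
    key=lambda item: item[1],
    reverse=True,
)
top_agents = [name for name, _ in totals[:top_n]]
filtered = {name: result_data[name] for name in top_agents}
print(top_agents)          # ['agent-c', 'agent-a']
print(sorted(filtered))    # ['agent-a', 'agent-c']
```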
|
|
|
| 527 |
"""
|
| 528 |
Save PR metadata to HuggingFace dataset, organized by [agent_identifier]/YYYY.MM.DD.jsonl.
|
| 529 |
Each file is stored in the agent's folder and named YYYY.MM.DD.jsonl for that day's PRs.
|
|
|
|
| 530 |
|
| 531 |
+
This function OVERWRITES existing files completely with fresh data from BigQuery.
|
| 532 |
+
Uses batch upload to avoid rate limit (uploads entire folder in single operation).
|
| 533 |
|
| 534 |
Args:
|
| 535 |
metadata_list: List of PR metadata dictionaries
|
| 536 |
agent_identifier: GitHub identifier of the agent (used as folder name)
|
| 537 |
"""
|
|
|
|
| 538 |
import shutil
|
| 539 |
|
|
|
| 540 |
try:
|
| 541 |
token = get_hf_token()
|
| 542 |
if not token:
|
| 543 |
raise Exception("No HuggingFace token found")
|
| 544 |
|
| 545 |
+
api = HfApi(token=token)
|
| 546 |
|
| 547 |
+
# Group by date (year, month, day)
|
| 548 |
grouped = group_metadata_by_date(metadata_list)
|
| 549 |
|
| 550 |
+
if not grouped:
|
| 551 |
+
print(f" No valid metadata to save for {agent_identifier}")
|
| 552 |
+
return False
|
| 553 |
+
|
| 554 |
+
# Create a temporary directory for batch upload
|
| 555 |
temp_dir = tempfile.mkdtemp()
|
| 556 |
+
agent_folder = os.path.join(temp_dir, agent_identifier)
|
| 557 |
+
os.makedirs(agent_folder, exist_ok=True)
|
| 558 |
|
| 559 |
try:
|
| 560 |
+
print(f" 📦 Preparing batch upload for {len(grouped)} daily files...")
|
| 561 |
|
| 562 |
+
# Process each daily file
|
| 563 |
for (pr_year, month, day), day_metadata in grouped.items():
|
|
|
|
| 564 |
filename = f"{agent_identifier}/{pr_year}.{month:02d}.{day:02d}.jsonl"
|
| 565 |
+
local_filename = os.path.join(agent_folder, f"{pr_year}.{month:02d}.{day:02d}.jsonl")
|
| 566 |
|
| 567 |
+
# Sort by created_at for better organization
|
| 568 |
+
day_metadata.sort(key=lambda x: x.get('created_at', ''), reverse=True)
|
| 569 |
|
| 570 |
+
# Save to temp directory (complete overwrite, no merging)
|
| 571 |
+
save_jsonl(local_filename, day_metadata)
|
| 572 |
+
print(f" Prepared {len(day_metadata)} PRs for {filename}")
|
| 573 |
+
|
| 574 |
+
# Upload entire folder using upload_large_folder (optimized for large files)
|
| 575 |
+
print(f" 📤 Uploading {len(grouped)} files ({len(metadata_list)} total PRs)...")
|
| 576 |
+
api.upload_large_folder(
|
|
|
| 577 |
folder_path=temp_dir,
|
| 578 |
repo_id=PR_METADATA_REPO,
|
| 579 |
+
repo_type="dataset"
|
|
|
|
|
|
|
| 580 |
)
|
| 581 |
+
print(f" ✓ Batch upload complete for {agent_identifier}")
|
| 582 |
|
| 583 |
+
return True
|
|
|
|
|
|
|
| 584 |
|
| 585 |
+
finally:
|
| 586 |
+
# Always clean up temp directory
|
| 587 |
+
if os.path.exists(temp_dir):
|
| 588 |
+
shutil.rmtree(temp_dir)
|
| 589 |
|
| 590 |
except Exception as e:
|
| 591 |
+
print(f" ✗ Error saving PR metadata: {str(e)}")
|
| 592 |
+
import traceback
|
| 593 |
+
traceback.print_exc()
|
| 594 |
return False
|
| 595 |
|
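The `group_metadata_by_date` helper called above is not part of this hunk; a plausible sketch of the grouping it implies (bucketing records by the date portion of `created_at`) is shown below, and the actual implementation in app.py may differ.

```python
# Plausible sketch only: bucket PR records by (year, month, day) of created_at,
# which is what the per-day JSONL layout above requires. Sample data is invented.
from collections import defaultdict
from datetime import datetime

def group_by_day(metadata_list):
    grouped = defaultdict(list)
    for item in metadata_list:
        created_at = item.get("created_at")
        if not created_at:
            continue
        dt = datetime.fromisoformat(created_at.replace("Z", "+00:00"))
        grouped[(dt.year, dt.month, dt.day)].append(item)
    return grouped

sample = [
    {"html_url": "https://github.com/org/repo/pull/1", "created_at": "2024-06-01T12:00:00Z"},
    {"html_url": "https://github.com/org/repo/pull/2", "created_at": "2024-06-01T15:30:00Z"},
    {"html_url": "https://github.com/org/repo/pull/3", "created_at": "2024-06-02T09:00:00Z"},
]
for (y, m, d), items in group_by_day(sample).items():
    print(f"{y}.{m:02d}.{d:02d}.jsonl -> {len(items)} PRs")
```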
| 596 |
|
| 597 |
def load_pr_metadata():
|
| 598 |
"""
|
| 599 |
Loads PR metadata from the last LEADERBOARD_TIME_FRAME_DAYS only.
|
|
|
|
| 600 |
|
| 601 |
Structure: [agent_identifier]/YYYY.MM.DD.jsonl
|
| 602 |
|
|
|
|
| 604 |
List of dictionaries with 'agent_identifier' added to each PR metadata.
|
| 605 |
Only includes PRs within the last LEADERBOARD_TIME_FRAME_DAYS.
|
| 606 |
"""
|
|
|
| 607 |
try:
|
| 608 |
api = HfApi()
|
| 609 |
token = get_hf_token()
|
|
|
|
| 637 |
# If date parsing fails, skip this file
|
| 638 |
continue
|
| 639 |
|
| 640 |
+
total_months = LEADERBOARD_TIME_FRAME_DAYS // 30
|
| 641 |
+
print(f"📥 Loading PR metadata from last {total_months} months ({len(relevant_files)} daily files across all agents)...")
|
| 642 |
|
| 643 |
all_metadata = []
|
| 644 |
for filename in relevant_files:
|
|
|
|
| 680 |
except Exception as e:
|
| 681 |
print(f" Warning: Could not load {filename}: {str(e)}")
|
| 682 |
|
| 683 |
+
print(f"✓ Loaded {len(all_metadata)} total PRs from last {total_months} months")
|
| 684 |
return all_metadata
|
| 685 |
|
| 686 |
except Exception as e:
|
| 687 |
+
total_months = LEADERBOARD_TIME_FRAME_DAYS // 30
|
| 688 |
+
print(f"✗ Error loading PR metadata from last {total_months} months: {str(e)}")
|
| 689 |
return []
|
| 690 |
|
| 691 |
|
| 692 |
+
def get_daily_files_last_time_frame(agent_identifier):
|
|
|
| 693 |
"""
|
| 694 |
+
Get list of daily file paths for an agent from the configured time frame.
|
| 695 |
|
| 696 |
Args:
|
| 697 |
agent_identifier: GitHub identifier of the agent
|
|
|
|
| 698 |
|
| 699 |
Returns:
|
| 700 |
List of file paths in format: [agent_identifier]/YYYY.MM.DD.jsonl
|
|
|
|
| 703 |
api = HfApi()
|
| 704 |
token = get_hf_token()
|
| 705 |
|
| 706 |
+
# Calculate date range using configured time frame
|
| 707 |
today = datetime.now(timezone.utc)
|
| 708 |
+
cutoff_date = today - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
|
| 709 |
|
| 710 |
# List all files in the repository
|
| 711 |
files = api.list_repo_files(repo_id=PR_METADATA_REPO, repo_type="dataset")
|
|
|
|
| 731 |
file_year, file_month, file_day = map(int, date_components)
|
| 732 |
file_date = datetime(file_year, file_month, file_day, tzinfo=timezone.utc)
|
| 733 |
|
| 734 |
+
# Include if within configured time frame
|
| 735 |
+
if cutoff_date <= file_date <= today:
|
| 736 |
recent_files.append(filename)
|
| 737 |
except Exception:
|
| 738 |
continue
|
|
|
|
| 744 |
return []
|
| 745 |
|
| 746 |
|
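As a quick sketch of the filename filter above: daily files are named `YYYY.MM.DD.jsonl`, and only those whose date falls inside the leaderboard window survive; the file list below is invented.

```python
# Sketch of filtering [agent]/YYYY.MM.DD.jsonl paths by the leaderboard window.
from datetime import datetime, timedelta, timezone

LEADERBOARD_TIME_FRAME_DAYS = 180
today = datetime.now(timezone.utc)
cutoff = today - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)

files = [
    f"example-agent/{(today - timedelta(days=10)).strftime('%Y.%m.%d')}.jsonl",   # recent, kept
    f"example-agent/{(today - timedelta(days=400)).strftime('%Y.%m.%d')}.jsonl",  # too old, dropped
    "example-agent/README.md",                                                    # not a daily file, skipped
]

recent = []
for path in files:
    name = path.split("/")[-1]
    if not name.endswith(".jsonl"):
        continue
    try:
        year, month, day = map(int, name[:-len(".jsonl")].split("."))
    except ValueError:
        continue
    file_date = datetime(year, month, day, tzinfo=timezone.utc)
    if cutoff <= file_date <= today:
        recent.append(path)

print(recent)
```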
|
|
| 747 |
# =============================================================================
|
| 748 |
# HUGGINGFACE DATASET OPERATIONS
|
| 749 |
# =============================================================================
|
|
|
|
| 773 |
|
| 774 |
with open(file_path, 'r') as f:
|
| 775 |
agent_data = json.load(f)
|
| 776 |
+
|
| 777 |
+
# Extract github_identifier from filename (remove .json extension)
|
| 778 |
+
github_identifier = json_file.replace('.json', '')
|
| 779 |
+
agent_data['github_identifier'] = github_identifier
|
| 780 |
+
|
| 781 |
agents.append(agent_data)
|
| 782 |
|
| 783 |
except Exception as e:
|
|
|
|
| 792 |
return None
|
| 793 |
|
| 794 |
|
|
|
|
|
|
|
| 795 |
def get_hf_token():
|
| 796 |
"""Get HuggingFace token from environment variables."""
|
| 797 |
token = os.getenv('HF_TOKEN')
|
|
|
|
| 881 |
return False
|
| 882 |
|
| 883 |
|
|
|
|
|
|
|
| 884 |
# =============================================================================
|
| 885 |
# DATA MANAGEMENT
|
| 886 |
# =============================================================================
|
| 887 |
|
| 888 |
+
def mine_all_agents():
|
| 889 |
"""
|
| 890 |
+
Mine PR metadata for all agents within UPDATE_TIME_FRAME_DAYS and save to HuggingFace.
|
| 891 |
+
Uses ONE BigQuery query for ALL agents (most efficient approach).
|
| 892 |
+
|
| 893 |
+
This runs weekly to refresh the data with the latest PRs from the past UPDATE_TIME_FRAME_DAYS.
|
|
|
| 894 |
"""
|
| 895 |
+
# Load agent metadata from HuggingFace
|
| 896 |
+
agents = load_agents_from_hf()
|
| 897 |
+
if not agents:
|
| 898 |
+
print("No agents found in HuggingFace dataset")
|
| 899 |
+
return
|
| 900 |
+
|
| 901 |
+
# Extract all identifiers
|
| 902 |
+
identifiers = [agent['github_identifier'] for agent in agents if agent.get('github_identifier')]
|
| 903 |
+
if not identifiers:
|
| 904 |
+
print("No valid agent identifiers found")
|
| 905 |
+
return
|
| 906 |
+
|
| 907 |
print(f"\n{'='*80}")
|
| 908 |
+
print(f"Starting PR metadata mining for {len(identifiers)} agents")
|
| 909 |
+
print(f"Time frame: Last {UPDATE_TIME_FRAME_DAYS} days")
|
| 910 |
+
print(f"Data source: BigQuery + GitHub Archive (ONE QUERY FOR ALL AGENTS)")
|
| 911 |
+
print(f"{'='*80}\n")
|
| 912 |
|
| 913 |
+
# Initialize BigQuery client
|
| 914 |
try:
|
| 915 |
+
client = get_bigquery_client()
|
| 916 |
+
except Exception as e:
|
| 917 |
+
print(f"✗ Failed to initialize BigQuery client: {str(e)}")
|
| 918 |
+
return
|
|
|
| 919 |
|
| 920 |
+
# Define time range: past UPDATE_TIME_FRAME_DAYS (excluding today)
|
| 921 |
+
current_time = datetime.now(timezone.utc)
|
| 922 |
+
end_date = current_time.replace(hour=0, minute=0, second=0, microsecond=0)
|
| 923 |
+
start_date = end_date - timedelta(days=UPDATE_TIME_FRAME_DAYS)
|
|
|
| 924 |
|
| 925 |
+
try:
|
| 926 |
+
all_metadata = fetch_all_pr_metadata_single_query(
|
| 927 |
+
client, identifiers, start_date, end_date
|
| 928 |
+
)
|
| 929 |
+
except Exception as e:
|
| 930 |
+
print(f"✗ Error during BigQuery fetch: {str(e)}")
|
| 931 |
+
import traceback
|
| 932 |
+
traceback.print_exc()
|
| 933 |
+
return
|
| 934 |
|
| 935 |
+
# Save results for each agent
|
| 936 |
+
print(f"\n{'='*80}")
|
| 937 |
+
print(f"💾 Saving results to HuggingFace for each agent...")
|
| 938 |
+
print(f"{'='*80}\n")
|
| 939 |
|
| 940 |
+
success_count = 0
|
| 941 |
+
error_count = 0
|
| 942 |
+
no_data_count = 0
|
| 943 |
+
|
| 944 |
+
for i, agent in enumerate(agents, 1):
|
| 945 |
+
identifier = agent.get('github_identifier')
|
| 946 |
+
agent_name = agent.get('agent_name', 'Unknown')
|
| 947 |
|
| 948 |
+
if not identifier:
|
| 949 |
+
print(f"[{i}/{len(agents)}] Skipping agent without identifier")
|
| 950 |
+
error_count += 1
|
| 951 |
+
continue
|
|
|
|
|
|
|
| 952 |
|
| 953 |
+
metadata = all_metadata.get(identifier, [])
|
| 954 |
|
| 955 |
+
print(f"[{i}/{len(agents)}] {agent_name} ({identifier}):")
|
| 956 |
+
|
| 957 |
+
try:
|
| 958 |
+
if metadata:
|
| 959 |
+
print(f" 💾 Saving {len(metadata)} PR records...")
|
| 960 |
+
if save_pr_metadata_to_hf(metadata, identifier):
|
| 961 |
+
success_count += 1
|
| 962 |
+
else:
|
| 963 |
+
error_count += 1
|
| 964 |
+
else:
|
| 965 |
+
print(f" No PRs found")
|
| 966 |
+
no_data_count += 1
|
| 967 |
+
|
| 968 |
+
except Exception as e:
|
| 969 |
+
print(f" ✗ Error saving {identifier}: {str(e)}")
|
| 970 |
+
import traceback
|
| 971 |
+
traceback.print_exc()
|
| 972 |
+
error_count += 1
|
| 973 |
+
continue
|
| 974 |
+
|
| 975 |
+
print(f"\n{'='*80}")
|
| 976 |
+
print(f"✅ Mining complete!")
|
| 977 |
+
print(f" Total agents: {len(agents)}")
|
| 978 |
+
print(f" Successfully saved: {success_count}")
|
| 979 |
+
print(f" No data (skipped): {no_data_count}")
|
| 980 |
+
print(f" Errors: {error_count}")
|
| 981 |
+
print(f" BigQuery queries executed: 1")
|
| 982 |
+
print(f"{'='*80}\n")
|
| 983 |
|
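For clarity, the mining window computed above ends at today's UTC midnight and starts UPDATE_TIME_FRAME_DAYS earlier, so the still-accumulating current day is never scanned; a standalone sketch:

```python
# Standalone sketch of the BigQuery scan window used by mine_all_agents.
from datetime import datetime, timedelta, timezone

UPDATE_TIME_FRAME_DAYS = 30

now = datetime.now(timezone.utc)
end_date = now.replace(hour=0, minute=0, second=0, microsecond=0)   # today 00:00 UTC
start_date = end_date - timedelta(days=UPDATE_TIME_FRAME_DAYS)

print(f"Scan {start_date:%Y-%m-%d} .. {end_date:%Y-%m-%d} "
      f"({(end_date - start_date).days} daily githubarchive tables)")
```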
| 984 |
|
| 985 |
def construct_leaderboard_from_metadata():
|
|
|
|
| 1025 |
# UI FUNCTIONS
|
| 1026 |
# =============================================================================
|
| 1027 |
|
| 1028 |
+
def generate_color(index, total):
|
| 1029 |
+
"""Generate distinct colors using HSL color space for better distribution"""
|
| 1030 |
+
hue = (index * 360 / total) % 360
|
| 1031 |
+
saturation = 70 + (index % 3) * 10 # Vary saturation slightly
|
| 1032 |
+
lightness = 45 + (index % 2) * 10 # Vary lightness slightly
|
| 1033 |
+
return f'hsl({hue}, {saturation}%, {lightness}%)'
|
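A standalone copy of the formula, handy for eyeballing the palette it produces for a handful of agents:

```python
# Same HSL spread as generate_color above, shown for five agents.
def generate_color(index, total):
    hue = (index * 360 / total) % 360
    saturation = 70 + (index % 3) * 10
    lightness = 45 + (index % 2) * 10
    return f'hsl({hue}, {saturation}%, {lightness}%)'

for i in range(5):
    print(generate_color(i, 5))
# hsl(0.0, 70%, 45%)  hsl(72.0, 80%, 55%)  hsl(144.0, 90%, 45%) ...
```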
| 1034 |
+
|
| 1035 |
+
|
| 1036 |
+
def create_monthly_metrics_plot(top_n=5):
|
| 1037 |
"""
|
| 1038 |
Create a Plotly figure with dual y-axes showing:
|
| 1039 |
- Left y-axis: Acceptance rate (%) as line curves
|
| 1040 |
- Right y-axis: Total PRs created as bar charts
|
| 1041 |
|
| 1042 |
Each agent gets a unique color for both their line and bars.
|
| 1043 |
+
|
| 1044 |
+
Args:
|
| 1045 |
+
top_n: Number of top agents to show (default: 5)
|
| 1046 |
"""
|
| 1047 |
+
metrics = calculate_monthly_metrics_by_agent(top_n=top_n)
|
| 1048 |
|
| 1049 |
if not metrics['agents'] or not metrics['months']:
|
| 1050 |
# Return an empty figure with a message
|
|
|
|
| 1065 |
# Create figure with secondary y-axis
|
| 1066 |
fig = make_subplots(specs=[[{"secondary_y": True}]])
|
| 1067 |
|
|
|
| 1068 |
agents = metrics['agents']
|
| 1069 |
months = metrics['months']
|
| 1070 |
data = metrics['data']
|
| 1071 |
|
| 1072 |
+
# Generate colors for all agents using HSL
|
| 1073 |
+
agent_colors = {agent: generate_color(idx, len(agents)) for idx, agent in enumerate(agents)}
|
| 1074 |
+
|
| 1075 |
# Add traces for each agent
|
| 1076 |
for idx, agent_name in enumerate(agents):
|
| 1077 |
+
color = agent_colors[agent_name]
|
| 1078 |
agent_data = data[agent_name]
|
| 1079 |
|
| 1080 |
# Add line trace for acceptance rate (left y-axis)
|
|
|
|
| 1194 |
return df
|
| 1195 |
|
| 1196 |
|
|
|
|
|
|
|
| 1197 |
def submit_agent(identifier, agent_name, organization, description, website):
|
| 1198 |
"""
|
| 1199 |
Submit a new agent to the leaderboard.
|
| 1200 |
Validates input and saves submission.
|
| 1201 |
+
PR data will be populated by the weekly mining task.
|
| 1202 |
"""
|
| 1203 |
# Validate required fields
|
| 1204 |
if not identifier or not identifier.strip():
|
|
|
|
| 1242 |
if not save_agent_to_hf(submission):
|
| 1243 |
return "❌ Failed to save submission", get_leaderboard_dataframe(), create_monthly_metrics_plot()
|
| 1244 |
|
| 1245 |
+
success_msg = f"✅ Successfully submitted {agent_name}!\n\nPR data will be populated by the weekly mining task (runs every Monday at 12:00 AM UTC)."
|
| 1246 |
return success_msg, get_leaderboard_dataframe(), create_monthly_metrics_plot()
|
| 1247 |
|
| 1248 |
|
|
|
|
| 1250 |
# GRADIO APPLICATION
|
| 1251 |
# =============================================================================
|
| 1252 |
|
| 1253 |
+
print(f"\n🚀 Starting SWE Agent PR Leaderboard")
|
| 1254 |
+
print(f" Leaderboard time frame: {LEADERBOARD_TIME_FRAME_DAYS} days ({LEADERBOARD_TIME_FRAME_DAYS // 30} months)")
|
| 1255 |
+
print(f" Mining update frequency: Every {UPDATE_TIME_FRAME_DAYS} days\n")
|
|
|
| 1256 |
|
| 1257 |
+
# Start APScheduler for weekly PR mining at 12:00 AM UTC every Monday
|
|
|
| 1258 |
scheduler = BackgroundScheduler(timezone="UTC")
|
| 1259 |
scheduler.add_job(
|
| 1260 |
+
mine_all_agents,
|
| 1261 |
+
trigger=CronTrigger(day_of_week='mon', hour=0, minute=0), # 12:00 AM UTC every Monday
|
| 1262 |
+
id='weekly_pr_mining',
|
| 1263 |
+
name='Weekly PR Mining',
|
| 1264 |
replace_existing=True
|
| 1265 |
)
|
| 1266 |
scheduler.start()
|
| 1267 |
+
print(f"✓ Scheduler started: Weekly PR Mining at 12:00 AM UTC every Monday (mines last {UPDATE_TIME_FRAME_DAYS} days)")
|
| 1268 |
|
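If needed, the registered job can be sanity-checked after start-up; a standalone sketch using APScheduler's get_job, assuming the same trigger as above:

```python
# Standalone sketch: same weekly trigger, then read back the computed next fire time.
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger

def weekly_job():
    print("mining...")

scheduler = BackgroundScheduler(timezone="UTC")
scheduler.add_job(weekly_job, trigger=CronTrigger(day_of_week='mon', hour=0, minute=0),
                  id='weekly_pr_mining', replace_existing=True)
scheduler.start()
print(scheduler.get_job('weekly_pr_mining').next_run_time)  # next Monday 00:00 UTC
scheduler.shutdown()
```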
| 1269 |
# Create Gradio interface
|
| 1270 |
with gr.Blocks(title="SWE Agent PR Leaderboard", theme=gr.themes.Soft()) as app:
|
| 1271 |
+
total_months = LEADERBOARD_TIME_FRAME_DAYS // 30
|
| 1272 |
+
|
| 1273 |
gr.Markdown("# 🏆 SWE Agent PR Leaderboard")
|
| 1274 |
+
gr.Markdown(f"Track and compare GitHub pull request statistics for SWE agents (last {total_months} months)")
|
| 1275 |
+
|
| 1276 |
with gr.Tabs():
|
| 1277 |
+
|
| 1278 |
# Leaderboard Tab
|
| 1279 |
with gr.Tab("📊 Leaderboard"):
|
| 1280 |
+
gr.Markdown(f"*All statistics are based on PRs from the last {total_months} months*")
|
| 1281 |
|
| 1282 |
leaderboard_table = Leaderboard(
|
| 1283 |
value=get_leaderboard_dataframe(),
|
| 1284 |
datatype=LEADERBOARD_COLUMNS,
|
| 1285 |
search_columns=["Agent Name", "Website"],
|
| 1286 |
+
filter_columns=[
|
| 1287 |
+
ColumnFilter(
|
| 1288 |
+
"Acceptance Rate (%)",
|
| 1289 |
+
min=0,
|
| 1290 |
+
max=100,
|
| 1291 |
+
default=[0, 100],
|
| 1292 |
+
type="slider",
|
| 1293 |
+
label="Acceptance Rate (%)"
|
| 1294 |
+
)
|
| 1295 |
+
]
|
| 1296 |
)
|
| 1297 |
|
| 1298 |
+
gr.Markdown("### Monthly Metrics - Top 5 Agents")
|
| 1299 |
+
gr.Markdown("Track acceptance rates and PR activity over time for the most active agents")
|
| 1300 |
|
| 1301 |
monthly_plot = gr.Plot(
|
| 1302 |
+
value=create_monthly_metrics_plot(top_n=5),
|
| 1303 |
label="Monthly PR Metrics"
|
| 1304 |
)
|
| 1305 |
+
|
| 1306 |
# Submit Agent Tab
|
| 1307 |
with gr.Tab("➕ Submit Agent"):
|
| 1308 |
+
|
| 1309 |
gr.Markdown("### Submit Your Agent")
|
| 1310 |
+
gr.Markdown("Fill in the details below to add your agent to the leaderboard.")
|
| 1311 |
+
|
| 1312 |
with gr.Row():
|
| 1313 |
with gr.Column():
|
| 1314 |
github_input = gr.Textbox(
|
|
|
|
| 1319 |
label="Agent Name*",
|
| 1320 |
placeholder="Your agent's display name"
|
| 1321 |
)
|
| 1322 |
+
|
| 1323 |
with gr.Column():
|
| 1324 |
organization_input = gr.Textbox(
|
| 1325 |
label="Organization*",
|
|
|
|
| 1331 |
lines=3
|
| 1332 |
)
|
| 1333 |
website_input = gr.Textbox(
|
| 1334 |
+
label="Website*",
|
| 1335 |
placeholder="https://your-agent-website.com"
|
| 1336 |
)
|
| 1337 |
+
|
| 1338 |
submit_button = gr.Button(
|
| 1339 |
"Submit Agent",
|
| 1340 |
variant="primary"
|
|
|
|
| 1343 |
label="Submission Status",
|
| 1344 |
interactive=False
|
| 1345 |
)
|
| 1346 |
+
|
| 1347 |
# Event handler
|
| 1348 |
submit_button.click(
|
| 1349 |
fn=submit_agent,
|
|
|
|
| 1354 |
|
| 1355 |
# Launch application
|
| 1356 |
if __name__ == "__main__":
|
| 1357 |
+
app.launch()
|