zhimin-z committed on
Commit 17e5dc2 · 1 parent: 3f0eb80

add streaming

Files changed (1)
  1. msr.py +180 -416
msr.py CHANGED
@@ -1,8 +1,3 @@
1
- """
2
- Minimalist PR Metadata Mining Script
3
- Mines PR metadata from locally downloaded GHArchive data via DuckDB and saves to HuggingFace dataset.
4
- """
5
-
6
  import json
7
  import os
8
  import time
@@ -30,31 +25,35 @@ load_dotenv()
30
 
31
  AGENTS_REPO = "SWE-Arena/bot_metadata"
32
  PR_METADATA_REPO = "SWE-Arena/pr_metadata"
33
- LEADERBOARD_REPO = "SWE-Arena/leaderboard_metadata" # HuggingFace dataset for leaderboard data
34
- LEADERBOARD_TIME_FRAME_DAYS = 180 # Time frame for leaderboard
35
- GHARCHIVE_DATA_DIR = "../gharchive/data" # Local GHArchive data directory
36
- DUCKDB_CACHE_FILE = "cache.duckdb" # Persistent DuckDB database for caching
 
 
 
 
37
 
38
- # DuckDB performance configuration
39
- DUCKDB_THREADS = 8 # Number of threads for parallel processing
40
- DUCKDB_MEMORY_LIMIT = "64GB" # Memory limit to prevent OOM crashes
41
 
42
  # Download configuration
43
- DOWNLOAD_WORKERS = 4 # Number of parallel download threads
44
- DOWNLOAD_RETRY_DELAY = 2 # Initial retry delay in seconds
45
- MAX_RETRIES = 5 # Maximum number of retries for each API call
46
 
47
  # Upload configuration
48
- UPLOAD_DELAY_SECONDS = 5 # Delay between individual file uploads to avoid rate limits
49
- UPLOAD_INITIAL_BACKOFF = 60 # Initial backoff time in seconds (1 minute)
50
- UPLOAD_MAX_BACKOFF = 3600 # Maximum backoff time in seconds (60 minutes)
51
 
52
  # Scheduler configuration
53
- SCHEDULE_ENABLED = True # Enable/disable scheduler
54
 - SCHEDULE_DAY_OF_MONTH = 8 # Day of month (1-31) - the 8th falls in the second week
55
- SCHEDULE_HOUR = 0 # Hour (0-23) - 12am midnight
56
- SCHEDULE_MINUTE = 0 # Minute (0-59)
57
- SCHEDULE_TIMEZONE = 'UTC' # Timezone for scheduling
58
 
59
  # =============================================================================
60
  # UTILITY FUNCTIONS
@@ -85,40 +84,24 @@ def save_jsonl(filename, data):
85
 
86
 
87
  def normalize_date_format(date_string):
88
- """
89
- Convert date strings or datetime objects to standardized ISO 8601 format with Z suffix.
90
- Handles both 'T' and space-separated datetime formats (including newlines).
91
- Examples:
92
- - 2025-10-15T23:23:47.983068 -> 2025-10-15T23:23:47Z
93
- - 2025-06-17 21:21:07+00 -> 2025-06-17T21:21:07Z
94
- - datetime object -> 2025-10-15T23:23:47Z
95
- """
96
  if not date_string or date_string == 'N/A':
97
  return 'N/A'
98
 
99
  try:
100
  import re
101
 
102
- # Handle datetime objects directly
103
  if isinstance(date_string, datetime):
104
  return date_string.strftime('%Y-%m-%dT%H:%M:%SZ')
105
 
106
- # Remove all whitespace (spaces, newlines, tabs) and replace with single space
107
  date_string = re.sub(r'\s+', ' ', date_string.strip())
108
-
109
- # Replace space with 'T' for ISO format compatibility
110
  date_string = date_string.replace(' ', 'T')
111
 
112
- # Fix incomplete timezone offset (+00 or -00 -> +00:00 or -00:00)
113
- # Check if timezone offset exists and is incomplete
114
  if len(date_string) >= 3:
115
  if date_string[-3:-2] in ('+', '-') and ':' not in date_string[-3:]:
116
  date_string = date_string + ':00'
117
 
118
- # Parse the date string (handles both with and without microseconds)
119
  dt = datetime.fromisoformat(date_string.replace('Z', '+00:00'))
120
-
121
- # Convert to standardized format
122
  return dt.strftime('%Y-%m-%dT%H:%M:%SZ')
123
  except Exception as e:
124
  print(f"Warning: Could not parse date '{date_string}': {e}")
@@ -138,23 +121,13 @@ def get_hf_token():
138
  # =============================================================================
139
 
140
  def download_file(url):
141
- """
142
- Download a GHArchive file with retry logic.
143
-
144
- Args:
145
- url: URL to download
146
-
147
- Returns:
148
- bool: True if successful, False otherwise
149
- """
150
  filename = url.split("/")[-1]
151
  filepath = os.path.join(GHARCHIVE_DATA_DIR, filename)
152
 
153
- # Skip if json.gz already exists
154
  if os.path.exists(filepath):
155
  return True
156
 
157
- # Download with retry logic
158
  for attempt in range(MAX_RETRIES):
159
  try:
160
  response = requests.get(url, timeout=30)
@@ -165,12 +138,10 @@ def download_file(url):
165
 
166
  except requests.exceptions.HTTPError as e:
167
  if e.response.status_code == 404:
168
- # File doesn't exist, don't retry
169
  return False
170
  else:
171
- # Other HTTP errors, retry
172
  if attempt < MAX_RETRIES - 1:
173
- wait_time = DOWNLOAD_RETRY_DELAY * (2 ** attempt) # Exponential backoff
174
  print(f" ⚠ {filename}: HTTP error {e.response.status_code}, retrying in {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES})")
175
  time.sleep(wait_time)
176
  else:
@@ -179,16 +150,14 @@ def download_file(url):
179
  except (requests.exceptions.Timeout,
180
  requests.exceptions.ConnectionError,
181
  requests.exceptions.ReadTimeout) as e:
182
- # Timeout/connection errors, retry
183
  if attempt < MAX_RETRIES - 1:
184
- wait_time = DOWNLOAD_RETRY_DELAY * (2 ** attempt) # Exponential backoff
185
  print(f" ⚠ {filename}: {type(e).__name__}, retrying in {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES})")
186
  time.sleep(wait_time)
187
  else:
188
  print(f" ✗ {filename}: Failed after {MAX_RETRIES} attempts - {type(e).__name__}")
189
 
190
  except Exception as e:
191
- # Other errors, retry
192
  if attempt < MAX_RETRIES - 1:
193
  wait_time = DOWNLOAD_RETRY_DELAY * (2 ** attempt)
194
  print(f" ⚠ {filename}: {e}, retrying in {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES})")
@@ -200,17 +169,9 @@ def download_file(url):
200
 
201
 
202
  def download_all_gharchive_data():
203
- """
204
- Download all GHArchive data files for the last LEADERBOARD_TIME_FRAME_DAYS.
205
- Uses parallel downloads with ThreadPoolExecutor.
206
-
207
- Returns:
208
- bool: True if all downloads completed (some may have failed), False if critical error
209
- """
210
- # Create data directory if it doesn't exist
211
  os.makedirs(GHARCHIVE_DATA_DIR, exist_ok=True)
212
 
213
- # Generate URLs for last N days (hourly files: 0-23 for each day)
214
  end_date = datetime.now()
215
  start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
216
 
@@ -218,7 +179,6 @@ def download_all_gharchive_data():
218
  current_date = start_date
219
  while current_date <= end_date:
220
  date_str = current_date.strftime("%Y-%m-%d")
221
- # Generate hourly URLs for this day (0-23)
222
  for hour in range(24):
223
  url = f"https://data.gharchive.org/{date_str}-{hour}.json.gz"
224
  urls.append(url)
@@ -228,10 +188,7 @@ def download_all_gharchive_data():
228
 
229
  try:
230
  with ThreadPoolExecutor(max_workers=DOWNLOAD_WORKERS) as executor:
231
- # Submit all downloads
232
  futures = [executor.submit(download_file, url) for url in urls]
233
-
234
- # Wait for downloads to complete
235
  for future in as_completed(futures):
236
  downloads_processed += 1
237
 
@@ -246,25 +203,20 @@ def download_all_gharchive_data():
246
 
247
 
248
  # =============================================================================
249
- # HUGGINGFACE API WRAPPERS WITH ENHANCED BACKOFF
250
  # =============================================================================
251
 
252
  def is_retryable_error(e):
253
- """
254
- Check if exception is retryable (rate limit or timeout error).
255
- """
256
- # Check for rate limit error (429)
257
  if isinstance(e, HfHubHTTPError):
258
  if e.response.status_code == 429:
259
  return True
260
 
261
- # Check for timeout errors
262
  if isinstance(e, (requests.exceptions.Timeout,
263
  requests.exceptions.ReadTimeout,
264
  requests.exceptions.ConnectTimeout)):
265
  return True
266
 
267
- # Check if it's a timeout error wrapped in HfHubHTTPError
268
  if isinstance(e, Exception):
269
  error_str = str(e).lower()
270
  if 'timeout' in error_str or 'timed out' in error_str:
@@ -285,7 +237,7 @@ def is_retryable_error(e):
285
  )
286
  )
287
  def list_repo_files_with_backoff(api, **kwargs):
288
- """Wrapper for api.list_repo_files() with exponential backoff for retryable errors."""
289
  return api.list_repo_files(**kwargs)
290
 
291
 
@@ -301,7 +253,7 @@ def list_repo_files_with_backoff(api, **kwargs):
301
  )
302
  )
303
  def hf_hub_download_with_backoff(**kwargs):
304
- """Wrapper for hf_hub_download() with exponential backoff for retryable errors."""
305
  return hf_hub_download(**kwargs)
306
 
307
 
@@ -317,7 +269,7 @@ def hf_hub_download_with_backoff(**kwargs):
317
  )
318
  )
319
  def upload_file_with_backoff(api, **kwargs):
320
- """Wrapper for api.upload_file() with exponential backoff for retryable errors."""
321
  return api.upload_file(**kwargs)
322
 
323
 
@@ -333,44 +285,30 @@ def upload_file_with_backoff(api, **kwargs):
333
  )
334
  )
335
  def upload_folder_with_backoff(api, **kwargs):
336
- """Wrapper for api.upload_folder() with exponential backoff for retryable errors."""
337
  return api.upload_folder(**kwargs)
338
 
339
 
340
  def get_duckdb_connection():
341
  """
342
- Initialize DuckDB connection with persistent database and optimized parallelization.
343
-
344
- Returns:
345
- DuckDB connection object
346
  """
347
- # Use persistent database for caching results
348
  conn = duckdb.connect(DUCKDB_CACHE_FILE)
349
 
350
- # Optimize for parallel processing with memory limits
351
- conn.execute(f"SET threads TO {DUCKDB_THREADS};") # Configure parallel threads
352
- conn.execute("SET preserve_insertion_order = false;") # Better parallelization
353
- conn.execute("SET enable_object_cache = true;") # Cache objects for reuse
354
- conn.execute("SET temp_directory = '/tmp/duckdb_temp';") # Use fast temp storage if needed
355
- conn.execute(f"SET memory_limit = '{DUCKDB_MEMORY_LIMIT}';") # Limit memory to prevent OOM crashes
356
- conn.execute(f"SET max_memory = '{DUCKDB_MEMORY_LIMIT}';") # Hard memory cap
357
 
358
  return conn
359
 
360
 
361
  def generate_file_path_patterns(start_date, end_date, data_dir=GHARCHIVE_DATA_DIR):
362
- """
363
- Generate file path patterns for GHArchive data in date range.
364
- Only includes files that actually exist on disk.
365
-
366
- Args:
367
- start_date: Start datetime
368
- end_date: End datetime
369
- data_dir: Directory containing GHArchive data files
370
-
371
- Returns:
372
- List of file path patterns (hourly JSON.gz files) that exist
373
- """
374
  file_patterns = []
375
  missing_dates = set()
376
 
@@ -378,184 +316,160 @@ def generate_file_path_patterns(start_date, end_date, data_dir=GHARCHIVE_DATA_DI
378
  end_day = end_date.replace(hour=0, minute=0, second=0, microsecond=0)
379
 
380
  while current_date <= end_day:
381
- # Pattern for hourly JSON.gz files: 2024-11-15-{0..23}.json.gz
382
  date_has_files = False
383
  for hour in range(24):
384
  pattern = os.path.join(data_dir, f"{current_date.strftime('%Y-%m-%d')}-{hour}.json.gz")
385
- # Only add pattern if file exists
386
  if os.path.exists(pattern):
387
  file_patterns.append(pattern)
388
  date_has_files = True
389
 
390
- # Track missing dates
391
  if not date_has_files:
392
  missing_dates.add(current_date.strftime('%Y-%m-%d'))
393
 
394
- # Move to next day
395
  current_date += timedelta(days=1)
396
 
397
- # Print warning about missing dates
398
  if missing_dates:
399
- print(f" Warning: Skipping {len(missing_dates)} date(s) with no data files: {', '.join(sorted(missing_dates))}")
400
 
401
  return file_patterns
402
 
403
 
404
  # =============================================================================
405
- # DUCKDB QUERY FUNCTIONS
406
  # =============================================================================
407
 
408
- def fetch_all_pr_metadata_single_query(conn, identifiers, start_date, end_date):
409
- """
410
- Fetch PR metadata for ALL agents using ONE comprehensive DuckDB query.
411
-
412
- This query fetches:
413
- 1. PRs authored by agents (user.login matches identifier)
414
- 2. PR status (opened, merged, closed)
415
-
 
 
 
 
416
  Args:
417
  conn: DuckDB connection instance
418
- identifiers: List of GitHub usernames/bot identifiers
419
  start_date: Start datetime (timezone-aware)
420
  end_date: End datetime (timezone-aware)
421
-
422
  Returns:
423
- Dictionary mapping agent identifier to list of PR metadata:
424
- {
425
- 'agent-identifier': [
426
- {
427
- 'html_url': PR URL,
428
- 'created_at': Creation timestamp,
429
- 'merged_at': Merge timestamp (if merged, else None),
430
- 'closed_at': Close timestamp (if closed but not merged, else None)
431
- },
432
- ...
433
- ],
434
- ...
435
- }
436
  """
437
- # Generate file path patterns for the time range
438
- file_patterns = generate_file_path_patterns(start_date, end_date)
439
-
440
- if not file_patterns:
441
- print(" ✗ Error: No GHArchive data files found for the specified date range")
442
- return {}
443
-
444
- # Build identifier list for IN clause with proper escaping
445
  identifier_list = ', '.join([f"'{id}'" for id in identifiers])
446
-
447
- # Build file patterns list for SQL (as JSON array string)
448
- file_patterns_sql = '[' + ', '.join([f"'{fp}'" for fp in file_patterns]) + ']'
449
-
450
- # ============================================================================
451
- # REFINED DUCKDB QUERY - Using struct accessors for parsed JSON
452
- # ============================================================================
453
- query = f"""
454
- WITH pr_events AS (
455
- -- Get all PR opened/closed events
456
- SELECT
457
- CONCAT(
458
- REPLACE(repo.url, 'api.github.com/repos/', 'github.com/'),
459
- '/pull/',
460
- CAST(payload.pull_request.number AS VARCHAR)
461
- ) as url,
462
- actor.login as pr_author,
463
- created_at as event_time,
464
- payload.action as event_action
465
- FROM read_json({file_patterns_sql}, union_by_name=true, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true, maximum_object_size=2147483648)
466
- WHERE
467
- type = 'PullRequestEvent'
468
- AND payload.action IN ('opened', 'closed')
469
- AND payload.pull_request.number IS NOT NULL
470
- AND actor.login IN ({identifier_list})
471
- ),
472
- pr_timeline AS (
473
- -- Build timeline: opened_at and closed_at (closed could mean merged or rejected)
474
- SELECT
475
- url,
476
- pr_author,
477
- MIN(CASE WHEN event_action = 'opened' THEN event_time END) as created_at,
478
- MAX(CASE WHEN event_action = 'closed' THEN event_time END) as closed_at,
479
- -- Note: GHArchive doesn't distinguish merged vs closed, so merged_at = NULL
480
- NULL as merged_at
481
- FROM pr_events
482
- GROUP BY url, pr_author
483
- )
484
- SELECT
485
- url,
486
- pr_author,
487
- created_at,
488
- merged_at,
489
- closed_at
490
- FROM pr_timeline
491
- WHERE created_at IS NOT NULL
492
- ORDER BY created_at DESC
493
- """
494
-
495
- try:
496
- # Execute the query
497
- results = conn.execute(query).fetchall()
498
-
499
- if not results:
500
- print(f" ⚠ Warning: Query returned 0 results")
501
- print(f" Checked {len(identifiers)} agent(s): {', '.join(identifiers)}")
502
- return {}
503
-
504
- # Group results by agent identifier
505
- metadata_by_agent = defaultdict(list)
506
- unique_urls = set()
507
-
508
- for row in results:
509
- url = row[0]
510
- pr_author = row[1]
511
- created_at = normalize_date_format(row[2]) if row[2] else None
512
- merged_at = normalize_date_format(row[3]) if row[3] else None
513
- closed_at = normalize_date_format(row[4]) if row[4] else None
514
-
515
- # Skip if no valid URL
516
- if not url:
517
- continue
518
-
519
- # Track unique URLs for verification
520
- unique_urls.add(url)
521
-
522
- # Build metadata record
523
- pr_metadata = {
524
- 'html_url': url,
525
- 'created_at': created_at,
526
- 'merged_at': merged_at,
527
- 'closed_at': closed_at,
528
- }
529
-
530
- metadata_by_agent[pr_author].append(pr_metadata)
531
-
532
- # Log results per agent
533
- agents_with_data = sum(1 for prs in metadata_by_agent.values() if prs)
534
- print(f" ✓ Coverage: {agents_with_data}/{len(identifiers)} agents have PR data")
535
 
536
- for agent_id in sorted(metadata_by_agent.keys()):
537
- pr_count = len(metadata_by_agent[agent_id])
538
- print(f" - {agent_id}: {pr_count} PRs")
539
-
540
- # Convert defaultdict to regular dict before returning
541
- return dict(metadata_by_agent)
542
-
543
- except Exception as e:
544
- print(f" ✗ DuckDB query error: {str(e)}")
545
- import traceback
546
- traceback.print_exc()
547
- return {}
548
 -
549
 
550
- # =============================================================================
551
- # HUGGINGFACE STORAGE FUNCTIONS WITH BATCH UPLOAD
552
- # =============================================================================
553
 
554
  def group_metadata_by_date(metadata_list):
555
- """
556
- Group PR metadata by date (year.month.day) for daily storage.
557
- Returns dict: {(year, month, day): [metadata_list]}
558
- """
559
  grouped = defaultdict(list)
560
 
561
  for pr_meta in metadata_list:
@@ -574,21 +488,7 @@ def group_metadata_by_date(metadata_list):
574
 
575
 
576
  def upload_single_file_with_retry(api, local_path, repo_path, repo_id, repo_type, commit_message, max_retries=MAX_RETRIES):
577
- """
578
- Upload a single file with exponential backoff retry logic.
579
-
580
- Args:
581
- api: HfApi instance
582
- local_path: Local file path
583
- repo_path: Path in repository
584
- repo_id: Repository ID
585
- repo_type: Repository type (e.g., "dataset")
586
- commit_message: Commit message
587
- max_retries: Maximum number of retries
588
-
589
- Returns:
590
- bool: True if successful, False otherwise
591
- """
592
  for attempt in range(max_retries):
593
  try:
594
  upload_file_with_backoff(
@@ -602,7 +502,6 @@ def upload_single_file_with_retry(api, local_path, repo_path, repo_id, repo_type
602
  return True
603
  except Exception as e:
604
  if attempt < max_retries - 1:
605
- # Calculate exponential backoff
606
  wait_time = min(UPLOAD_INITIAL_BACKOFF * (2 ** attempt), UPLOAD_MAX_BACKOFF)
607
  print(f" {e} error on attempt {attempt + 1}/{max_retries}. Retrying in {wait_time}s...")
608
  time.sleep(wait_time)
@@ -613,16 +512,7 @@ def upload_single_file_with_retry(api, local_path, repo_path, repo_id, repo_type
613
 
614
 
615
  def batch_upload_pr_metadata(all_metadata):
616
- """
617
- Upload PR metadata for all agents with time gaps between uploads.
618
- Each agent's data is uploaded as separate daily files with retry logic.
619
-
620
- Args:
621
- all_metadata: Dictionary mapping agent identifier to list of PR metadata
622
-
623
- Returns:
624
- tuple: (success_count, error_count)
625
- """
626
  try:
627
  token = get_hf_token()
628
  if not token:
@@ -634,7 +524,6 @@ def batch_upload_pr_metadata(all_metadata):
634
  error_count = 0
635
  total_files = 0
636
 
637
- # First, calculate total number of files to upload
638
  for agent_identifier, metadata_list in all_metadata.items():
639
  if metadata_list:
640
  grouped = group_metadata_by_date(metadata_list)
@@ -648,28 +537,21 @@ def batch_upload_pr_metadata(all_metadata):
648
  if not metadata_list:
649
  continue
650
 
651
- # Group by date
652
  grouped = group_metadata_by_date(metadata_list)
653
 
654
- # Create temporary files for this agent
655
  agent_temp_dir = tempfile.mkdtemp()
656
 
657
  try:
658
- # Prepare all files locally
659
  local_files = []
660
  for (pr_year, month, day), day_metadata in grouped.items():
661
  filename = f"{pr_year}.{month:02d}.{day:02d}.jsonl"
662
  local_path = os.path.join(agent_temp_dir, filename)
663
  repo_path = f"{agent_identifier}/{filename}"
664
 
665
- # Sort by created_at for better organization
666
  day_metadata.sort(key=lambda x: x.get('created_at', ''), reverse=True)
667
-
668
- # Save to temp file
669
  save_jsonl(local_path, day_metadata)
670
  local_files.append((local_path, repo_path, len(day_metadata)))
671
 
672
- # Upload each file with delay
673
  agent_success = 0
674
  agent_error = 0
675
 
@@ -691,12 +573,10 @@ def batch_upload_pr_metadata(all_metadata):
691
  agent_error += 1
692
  error_count += 1
693
 
694
- # Add delay between uploads (except for last file)
695
  if file_idx < len(local_files):
696
  time.sleep(UPLOAD_DELAY_SECONDS)
697
 
698
  finally:
699
- # Clean up temp directory
700
  if os.path.exists(agent_temp_dir):
701
  import shutil
702
  shutil.rmtree(agent_temp_dir)
@@ -716,22 +596,14 @@ def batch_upload_pr_metadata(all_metadata):
716
 
717
 
718
  def load_agents_from_hf():
719
- """
720
- Load all agent metadata JSON files from HuggingFace dataset.
721
-
722
- The github_identifier is extracted from the filename (e.g., 'agent-name[bot].json' -> 'agent-name[bot]')
723
- """
724
  try:
725
  api = HfApi()
726
  agents = []
727
 
728
- # List all files in the repository
729
  files = list_repo_files_with_backoff(api=api, repo_id=AGENTS_REPO, repo_type="dataset")
730
-
731
- # Filter for JSON files only
732
  json_files = [f for f in files if f.endswith('.json')]
733
 
734
- # Download and parse each JSON file
735
  for json_file in json_files:
736
  try:
737
  file_path = hf_hub_download_with_backoff(
@@ -743,11 +615,9 @@ def load_agents_from_hf():
743
  with open(file_path, 'r') as f:
744
  agent_data = json.load(f)
745
 
746
- # Only process agents with status == "public"
747
  if agent_data.get('status') != 'public':
748
  continue
749
 
750
- # Extract github_identifier from filename (remove .json extension)
751
  github_identifier = json_file.replace('.json', '')
752
  agent_data['github_identifier'] = github_identifier
753
 
@@ -758,7 +628,6 @@ def load_agents_from_hf():
758
  continue
759
 
760
  print(f"Download complete: {len(agents)} agents")
761
-
762
  return agents
763
 
764
  except Exception as e:
@@ -766,28 +635,14 @@ def load_agents_from_hf():
766
  return []
767
 
768
 
769
- # =============================================================================
770
- # LEADERBOARD DATA COMPUTATION & STORAGE
771
- # =============================================================================
772
-
773
  def calculate_pr_stats_from_metadata(metadata_list):
774
- """
775
- Calculate statistics from a list of PR metadata.
776
-
777
- Returns a dictionary with comprehensive PR metrics.
778
- Acceptance rate = merged PRs / (merged PRs + closed but not merged PRs) * 100
779
- """
780
  total_prs = len(metadata_list)
781
  merged = sum(1 for pr_meta in metadata_list if pr_meta.get('merged_at'))
782
-
783
- # Count closed PRs (rejected) - those with closed_at but no merged_at
784
  closed_not_merged = sum(1 for pr_meta in metadata_list
785
  if pr_meta.get('closed_at') and not pr_meta.get('merged_at'))
786
 
787
- # Total decisions made = merged + closed (rejected)
788
  total_decisions = merged + closed_not_merged
789
-
790
- # Calculate acceptance rate based on decisions made
791
  acceptance_rate = (merged / total_decisions * 100) if total_decisions > 0 else 0
792
 
793
  return {
@@ -798,36 +653,14 @@ def calculate_pr_stats_from_metadata(metadata_list):
798
 
799
 
800
  def calculate_monthly_metrics_by_agent(all_metadata_dict, agents):
801
- """
802
- Calculate monthly metrics for all agents for visualization.
803
-
804
- Args:
805
- all_metadata_dict: Dictionary mapping agent identifier to list of PR metadata
806
- agents: List of agent dictionaries with metadata
807
-
808
- Returns:
809
- dict: {
810
- 'agents': list of agent names,
811
- 'months': list of month labels (e.g., '2025-01'),
812
- 'data': {
813
- agent_name: {
814
- 'acceptance_rates': list of acceptance rates by month,
815
- 'total_prs': list of PR counts by month,
816
- 'merged_prs': list of merged PR counts by month,
817
- }
818
- }
819
- }
820
- """
821
- # Create mapping from agent_identifier to agent_name
822
  identifier_to_name = {agent.get('github_identifier'): agent.get('name') for agent in agents if agent.get('github_identifier')}
823
 
824
  if not all_metadata_dict:
825
  return {'agents': [], 'months': [], 'data': {}}
826
 
827
- # Group by agent and month
828
  agent_month_data = defaultdict(lambda: defaultdict(list))
829
 
830
- # Flatten the dict of lists into a single list with agent_identifier added
831
  for agent_identifier, metadata_list in all_metadata_dict.items():
832
  for pr_meta in metadata_list:
833
  created_at = pr_meta.get('created_at')
@@ -835,7 +668,6 @@ def calculate_monthly_metrics_by_agent(all_metadata_dict, agents):
835
  if not created_at:
836
  continue
837
 
838
- # Get agent_name from identifier
839
  agent_name = identifier_to_name.get(agent_identifier, agent_identifier)
840
 
841
  try:
@@ -846,13 +678,11 @@ def calculate_monthly_metrics_by_agent(all_metadata_dict, agents):
846
  print(f"Warning: Could not parse date '{created_at}': {e}")
847
  continue
848
 
849
- # Get all unique months and sort them
850
  all_months = set()
851
  for agent_data in agent_month_data.values():
852
  all_months.update(agent_data.keys())
853
  months = sorted(list(all_months))
854
 
855
- # Calculate metrics for each agent and month
856
  result_data = {}
857
  for agent_name, month_dict in agent_month_data.items():
858
  acceptance_rates = []
@@ -863,17 +693,11 @@ def calculate_monthly_metrics_by_agent(all_metadata_dict, agents):
863
  for month in months:
864
  prs_in_month = month_dict.get(month, [])
865
 
866
- # Count merged PRs
867
  merged_count = sum(1 for pr in prs_in_month if pr.get('merged_at'))
868
-
869
- # Count closed but not merged
870
  closed_not_merged_count = sum(1 for pr in prs_in_month
871
  if pr.get('closed_at') and not pr.get('merged_at'))
872
-
873
- # Total PRs created in this month
874
  total_count = len(prs_in_month)
875
 
876
- # Calculate acceptance rate
877
  total_decisions = merged_count + closed_not_merged_count
878
  acceptance_rate = (merged_count / total_decisions * 100) if total_decisions > 0 else None
879
 
@@ -899,16 +723,7 @@ def calculate_monthly_metrics_by_agent(all_metadata_dict, agents):
899
 
900
 
901
  def construct_leaderboard_from_metadata(all_metadata_dict, agents):
902
- """
903
- Construct leaderboard from in-memory PR metadata.
904
-
905
- Args:
906
- all_metadata_dict: Dictionary mapping agent identifier to list of PR metadata
907
- agents: List of agent dictionaries with metadata
908
-
909
- Returns:
910
- Dictionary of agent stats.
911
- """
912
  if not agents:
913
  print("Error: No agents found")
914
  return {}
@@ -919,10 +734,7 @@ def construct_leaderboard_from_metadata(all_metadata_dict, agents):
919
  identifier = agent.get('github_identifier')
920
  agent_name = agent.get('name', 'Unknown')
921
 
922
- # Get metadata for this agent from the dictionary
923
  bot_metadata = all_metadata_dict.get(identifier, [])
924
-
925
- # Calculate stats
926
  stats = calculate_pr_stats_from_metadata(bot_metadata)
927
 
928
  cache_dict[identifier] = {
@@ -936,16 +748,7 @@ def construct_leaderboard_from_metadata(all_metadata_dict, agents):
936
 
937
 
938
  def save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics):
939
- """
940
- Save leaderboard data and monthly metrics to HuggingFace dataset as swe-pr.json.
941
-
942
- Args:
943
- leaderboard_dict: Dictionary of agent stats from construct_leaderboard_from_metadata()
944
- monthly_metrics: Monthly metrics data from calculate_monthly_metrics_by_agent()
945
-
946
- Returns:
947
- bool: True if successful, False otherwise
948
- """
949
  try:
950
  token = get_hf_token()
951
  if not token:
@@ -954,7 +757,6 @@ def save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics):
954
  api = HfApi(token=token)
955
  filename = "swe-pr.json"
956
 
957
- # Combine leaderboard and monthly metrics
958
  combined_data = {
959
  'last_updated': datetime.now(timezone.utc).isoformat(),
960
  'leaderboard': leaderboard_dict,
@@ -964,12 +766,10 @@ def save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics):
964
  }
965
  }
966
 
967
- # Save locally first
968
  with open(filename, 'w') as f:
969
  json.dump(combined_data, f, indent=2)
970
 
971
  try:
972
- # Upload to HuggingFace with retry logic
973
  upload_file_with_backoff(
974
  api=api,
975
  path_or_fileobj=filename,
@@ -979,7 +779,6 @@ def save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics):
979
  )
980
  return True
981
  finally:
982
- # Always clean up local file
983
  if os.path.exists(filename):
984
  os.remove(filename)
985
 
@@ -991,21 +790,19 @@ def save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics):
991
 
992
 
993
  # =============================================================================
994
- # MAIN MINING FUNCTION
995
  # =============================================================================
996
 
997
  def mine_all_agents():
998
  """
999
- Mine PR metadata for all agents within LEADERBOARD_TIME_FRAME_DAYS and save to HuggingFace.
1000
- Downloads GHArchive data first, then uses ONE DuckDB query for ALL agents, then batch uploads with time gaps.
1001
  """
1002
- # Step 1: Download GHArchive data
1003
  print(f"\n[1/5] Downloading GHArchive data...")
1004
 
1005
  if not download_all_gharchive_data():
1006
  print("Warning: Download had errors, continuing with available data...")
1007
 
1008
- # Step 2: Load agent metadata from HuggingFace
1009
  print(f"\n[2/5] Loading agent metadata...")
1010
 
1011
  agents = load_agents_from_hf()
@@ -1013,7 +810,6 @@ def mine_all_agents():
1013
  print("Error: No agents found")
1014
  return
1015
 
1016
- # Extract all identifiers
1017
  identifiers = [agent['github_identifier'] for agent in agents if agent.get('github_identifier')]
1018
  if not identifiers:
1019
  print("Error: No valid agent identifiers found")
@@ -1021,55 +817,42 @@ def mine_all_agents():
1021
 
1022
  print(f"\n[3/5] Mining PR metadata ({len(identifiers)} agents, {LEADERBOARD_TIME_FRAME_DAYS} days)...")
1023
 
1024
- # Initialize DuckDB connection
1025
  try:
1026
  conn = get_duckdb_connection()
1027
  except Exception as e:
1028
  print(f"Failed to initialize DuckDB connection: {str(e)}")
1029
  return
1030
 
1031
- # Define time range: past LEADERBOARD_TIME_FRAME_DAYS (excluding today)
1032
  current_time = datetime.now(timezone.utc)
1033
  end_date = current_time.replace(hour=0, minute=0, second=0, microsecond=0)
1034
  start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
1035
 
1036
  try:
1037
- # Use single query for all agents
1038
- all_metadata = fetch_all_pr_metadata_single_query(
1039
  conn, identifiers, start_date, end_date
1040
  )
1041
 
1042
- # Calculate summary statistics
1043
  total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
1044
  agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
1045
 
1046
- print(f"Query complete: {total_prs} PRs found for {agents_with_data}/{len(agents)} agents")
1047
-
1048
  except Exception as e:
1049
  print(f"Error during DuckDB fetch: {str(e)}")
1050
  import traceback
1051
  traceback.print_exc()
1052
  return
1053
  finally:
1054
- # Close DuckDB connection
1055
  conn.close()
1056
 
1057
- # Step 4: Batch upload PR metadata with time gaps
1058
  print(f"\n[4/5] Uploading PR metadata...")
1059
 
1060
  success_count, error_count = batch_upload_pr_metadata(all_metadata)
1061
 
1062
- # Step 5: Construct and save leaderboard data
1063
  print(f"\n[5/5] Saving leaderboard...")
1064
 
1065
  try:
1066
- # Construct leaderboard from in-memory data
1067
  leaderboard_dict = construct_leaderboard_from_metadata(all_metadata, agents)
1068
-
1069
- # Calculate monthly metrics from in-memory data
1070
  monthly_metrics = calculate_monthly_metrics_by_agent(all_metadata, agents)
1071
-
1072
- # Save to HuggingFace
1073
  save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics)
1074
 
1075
  print(f"\nCOMPLETE: {success_count} files uploaded" + (f", {error_count} errors" if error_count > 0 else ""))
@@ -1085,30 +868,16 @@ def mine_all_agents():
1085
  # =============================================================================
1086
 
1087
  def setup_scheduler():
1088
- """
1089
- Set up APScheduler to run mining jobs periodically.
1090
- Schedule is configurable via environment variables.
1091
-
1092
- Environment variables:
1093
- - SCHEDULE_ENABLED: Enable/disable scheduler (default: true)
1094
- - SCHEDULE_DAY_OF_MONTH: Day of month to run (default: 8, second week)
1095
- - SCHEDULE_HOUR: Hour to run (0-23, default: 0)
1096
- - SCHEDULE_MINUTE: Minute to run (0-59, default: 0)
1097
- - SCHEDULE_TIMEZONE: Timezone for scheduling (default: UTC)
1098
- """
1099
- # Configure logging for APScheduler
1100
  logging.basicConfig(
1101
  level=logging.INFO,
1102
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
1103
  )
1104
 
1105
- # Disable verbose HTTP request logging from httpx (used by huggingface_hub)
1106
  logging.getLogger('httpx').setLevel(logging.WARNING)
1107
 
1108
- # Create scheduler
1109
  scheduler = BlockingScheduler(timezone=SCHEDULE_TIMEZONE)
1110
 
1111
- # Create cron trigger with configured schedule (monthly on specific day)
1112
  trigger = CronTrigger(
1113
  day=SCHEDULE_DAY_OF_MONTH,
1114
  hour=SCHEDULE_HOUR,
@@ -1116,7 +885,6 @@ def setup_scheduler():
1116
  timezone=SCHEDULE_TIMEZONE
1117
  )
1118
 
1119
- # Add job to scheduler
1120
  scheduler.add_job(
1121
  mine_all_agents,
1122
  trigger=trigger,
@@ -1125,13 +893,11 @@ def setup_scheduler():
1125
  replace_existing=True
1126
  )
1127
 
1128
- # Print schedule information
1129
  from datetime import datetime
1130
  next_run = trigger.get_next_fire_time(None, datetime.now(trigger.timezone))
1131
  print(f"Scheduler: Monthly on day {SCHEDULE_DAY_OF_MONTH} at {SCHEDULE_HOUR:02d}:{SCHEDULE_MINUTE:02d} {SCHEDULE_TIMEZONE}")
1132
  print(f"Next run: {next_run}\n")
1133
 
1134
- # Start scheduler (blocking call)
1135
  print(f"\nScheduler started")
1136
  scheduler.start()
1137
 
@@ -1142,8 +908,6 @@ def setup_scheduler():
1142
 
1143
  if __name__ == "__main__":
1144
  if SCHEDULE_ENABLED:
1145
- # Run with scheduler
1146
  setup_scheduler()
1147
  else:
1148
- # Run without scheduler, just mine once
1149
- mine_all_agents()
 
 
 
 
 
 
1
  import json
2
  import os
3
  import time
 
25
 
26
  AGENTS_REPO = "SWE-Arena/bot_metadata"
27
  PR_METADATA_REPO = "SWE-Arena/pr_metadata"
28
+ LEADERBOARD_REPO = "SWE-Arena/leaderboard_metadata"
29
+ LEADERBOARD_TIME_FRAME_DAYS = 180
30
+ GHARCHIVE_DATA_DIR = "../gharchive/data"
31
+ DUCKDB_CACHE_FILE = "cache.duckdb"
32
+
33
+ # DuckDB performance configuration
34
+ DUCKDB_THREADS = 8
35
+ DUCKDB_MEMORY_LIMIT = "64GB"
36
 
37
+ # Streaming batch configuration
38
+ BATCH_SIZE_DAYS = 7 # Process 1 week at a time (~168 hourly files)
39
+ # At this size: ~7 days × 24 files × ~100MB per file = ~16GB uncompressed per batch
40
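The per-batch footprint quoted above is easy to sanity-check; a minimal sketch, assuming the ~100 MB per uncompressed hourly file figure from the comment:

    # Rough sizing check for the streaming batches (the ~100 MB per-file figure
    # is an assumption carried over from the comment above).
    FILES_PER_DAY = 24
    EST_MB_PER_FILE = 100

    def batch_footprint(batch_size_days):
        files = batch_size_days * FILES_PER_DAY
        est_gb = files * EST_MB_PER_FILE / 1024
        return files, est_gb

    print(batch_footprint(7))  # (168, ~16.4) -> roughly the ~16 GB quoted above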
 
41
  # Download configuration
42
+ DOWNLOAD_WORKERS = 4
43
+ DOWNLOAD_RETRY_DELAY = 2
44
+ MAX_RETRIES = 5
45
 
46
  # Upload configuration
47
+ UPLOAD_DELAY_SECONDS = 5
48
+ UPLOAD_INITIAL_BACKOFF = 60
49
+ UPLOAD_MAX_BACKOFF = 3600
50
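For reference, these constants drive the capped exponential backoff used later in upload_single_file_with_retry; a minimal sketch of the resulting wait schedule:

    # Wait times produced by min(UPLOAD_INITIAL_BACKOFF * 2**attempt, UPLOAD_MAX_BACKOFF).
    UPLOAD_INITIAL_BACKOFF = 60   # seconds
    UPLOAD_MAX_BACKOFF = 3600     # seconds
    MAX_RETRIES = 5

    waits = [min(UPLOAD_INITIAL_BACKOFF * (2 ** attempt), UPLOAD_MAX_BACKOFF)
             for attempt in range(MAX_RETRIES)]
    print(waits)  # [60, 120, 240, 480, 960]; with 5 retries the 3600 s cap is never hit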
 
51
  # Scheduler configuration
52
+ SCHEDULE_ENABLED = False
53
+ SCHEDULE_DAY_OF_MONTH = 8
54
+ SCHEDULE_HOUR = 0
55
+ SCHEDULE_MINUTE = 0
56
+ SCHEDULE_TIMEZONE = 'UTC'
57
 
58
  # =============================================================================
59
  # UTILITY FUNCTIONS
 
84
 
85
 
86
  def normalize_date_format(date_string):
87
+ """Convert date strings or datetime objects to standardized ISO 8601 format with Z suffix."""
 
 
 
 
 
 
 
88
  if not date_string or date_string == 'N/A':
89
  return 'N/A'
90
 
91
  try:
92
  import re
93
 
 
94
  if isinstance(date_string, datetime):
95
  return date_string.strftime('%Y-%m-%dT%H:%M:%SZ')
96
 
 
97
  date_string = re.sub(r'\s+', ' ', date_string.strip())
 
 
98
  date_string = date_string.replace(' ', 'T')
99
 
 
 
100
  if len(date_string) >= 3:
101
  if date_string[-3:-2] in ('+', '-') and ':' not in date_string[-3:]:
102
  date_string = date_string + ':00'
103
 
 
104
  dt = datetime.fromisoformat(date_string.replace('Z', '+00:00'))
 
 
105
  return dt.strftime('%Y-%m-%dT%H:%M:%SZ')
106
  except Exception as e:
107
  print(f"Warning: Could not parse date '{date_string}': {e}")
 
121
  # =============================================================================
122
 
123
  def download_file(url):
124
+ """Download a GHArchive file with retry logic."""
 
 
 
 
 
 
 
 
125
  filename = url.split("/")[-1]
126
  filepath = os.path.join(GHARCHIVE_DATA_DIR, filename)
127
 
 
128
  if os.path.exists(filepath):
129
  return True
130
 
 
131
  for attempt in range(MAX_RETRIES):
132
  try:
133
  response = requests.get(url, timeout=30)
 
138
 
139
  except requests.exceptions.HTTPError as e:
140
  if e.response.status_code == 404:
 
141
  return False
142
  else:
 
143
  if attempt < MAX_RETRIES - 1:
144
+ wait_time = DOWNLOAD_RETRY_DELAY * (2 ** attempt)
145
  print(f" ⚠ {filename}: HTTP error {e.response.status_code}, retrying in {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES})")
146
  time.sleep(wait_time)
147
  else:
 
150
  except (requests.exceptions.Timeout,
151
  requests.exceptions.ConnectionError,
152
  requests.exceptions.ReadTimeout) as e:
 
153
  if attempt < MAX_RETRIES - 1:
154
+ wait_time = DOWNLOAD_RETRY_DELAY * (2 ** attempt)
155
  print(f" ⚠ {filename}: {type(e).__name__}, retrying in {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES})")
156
  time.sleep(wait_time)
157
  else:
158
  print(f" ✗ {filename}: Failed after {MAX_RETRIES} attempts - {type(e).__name__}")
159
 
160
  except Exception as e:
 
161
  if attempt < MAX_RETRIES - 1:
162
  wait_time = DOWNLOAD_RETRY_DELAY * (2 ** attempt)
163
  print(f" ⚠ {filename}: {e}, retrying in {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES})")
 
169
 
170
 
171
  def download_all_gharchive_data():
172
+ """Download all GHArchive data files for the last LEADERBOARD_TIME_FRAME_DAYS."""
 
 
 
 
 
 
 
173
  os.makedirs(GHARCHIVE_DATA_DIR, exist_ok=True)
174
 
 
175
  end_date = datetime.now()
176
  start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
177
 
 
179
  current_date = start_date
180
  while current_date <= end_date:
181
  date_str = current_date.strftime("%Y-%m-%d")
 
182
  for hour in range(24):
183
  url = f"https://data.gharchive.org/{date_str}-{hour}.json.gz"
184
  urls.append(url)
 
188
 
189
  try:
190
  with ThreadPoolExecutor(max_workers=DOWNLOAD_WORKERS) as executor:
 
191
  futures = [executor.submit(download_file, url) for url in urls]
 
 
192
  for future in as_completed(futures):
193
  downloads_processed += 1
194
 
 
203
 
204
 
205
  # =============================================================================
206
+ # HUGGINGFACE API WRAPPERS
207
  # =============================================================================
208
 
209
  def is_retryable_error(e):
210
+ """Check if exception is retryable (rate limit or timeout error)."""
 
 
 
211
  if isinstance(e, HfHubHTTPError):
212
  if e.response.status_code == 429:
213
  return True
214
 
 
215
  if isinstance(e, (requests.exceptions.Timeout,
216
  requests.exceptions.ReadTimeout,
217
  requests.exceptions.ConnectTimeout)):
218
  return True
219
 
 
220
  if isinstance(e, Exception):
221
  error_str = str(e).lower()
222
  if 'timeout' in error_str or 'timed out' in error_str:
 
237
  )
238
  )
239
  def list_repo_files_with_backoff(api, **kwargs):
240
+ """Wrapper for api.list_repo_files() with exponential backoff."""
241
  return api.list_repo_files(**kwargs)
242
 
243
 
 
253
  )
254
  )
255
  def hf_hub_download_with_backoff(**kwargs):
256
+ """Wrapper for hf_hub_download() with exponential backoff."""
257
  return hf_hub_download(**kwargs)
258
 
259
 
 
269
  )
270
  )
271
  def upload_file_with_backoff(api, **kwargs):
272
+ """Wrapper for api.upload_file() with exponential backoff."""
273
  return api.upload_file(**kwargs)
274
 
275
 
 
285
  )
286
  )
287
  def upload_folder_with_backoff(api, **kwargs):
288
+ """Wrapper for api.upload_folder() with exponential backoff."""
289
  return api.upload_folder(**kwargs)
290
 
291
 
292
  def get_duckdb_connection():
293
  """
294
+ Initialize DuckDB connection with optimized memory settings.
295
+ Uses persistent database and reduced memory footprint.
 
 
296
  """
 
297
  conn = duckdb.connect(DUCKDB_CACHE_FILE)
298
 
299
+ # Optimized settings
300
+ conn.execute(f"SET threads TO {DUCKDB_THREADS};")
301
+ conn.execute("SET preserve_insertion_order = false;")
302
+ conn.execute("SET enable_object_cache = true;")
303
+ conn.execute("SET temp_directory = '/tmp/duckdb_temp';")
304
+ conn.execute(f"SET memory_limit = '{DUCKDB_MEMORY_LIMIT}';") # Per-query limit
305
+ conn.execute(f"SET max_memory = '{DUCKDB_MEMORY_LIMIT}';") # Hard cap
306
 
307
  return conn
308
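A quick usage sketch for confirming the settings applied to the cached connection (duckdb_settings() is DuckDB's built-in settings table function; exact values follow the constants above):

    # Usage sketch: open the cached connection and inspect the applied settings.
    conn = get_duckdb_connection()
    for name, value in conn.execute(
            "SELECT name, value FROM duckdb_settings() WHERE name IN ('threads', 'memory_limit')"
    ).fetchall():
        print(f"{name} = {value}")
    conn.close()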
 
309
 
310
  def generate_file_path_patterns(start_date, end_date, data_dir=GHARCHIVE_DATA_DIR):
311
+ """Generate file path patterns for GHArchive data in date range (only existing files)."""
 
 
 
 
 
 
 
 
 
 
 
312
  file_patterns = []
313
  missing_dates = set()
314
 
 
316
  end_day = end_date.replace(hour=0, minute=0, second=0, microsecond=0)
317
 
318
  while current_date <= end_day:
 
319
  date_has_files = False
320
  for hour in range(24):
321
  pattern = os.path.join(data_dir, f"{current_date.strftime('%Y-%m-%d')}-{hour}.json.gz")
 
322
  if os.path.exists(pattern):
323
  file_patterns.append(pattern)
324
  date_has_files = True
325
 
 
326
  if not date_has_files:
327
  missing_dates.add(current_date.strftime('%Y-%m-%d'))
328
 
 
329
  current_date += timedelta(days=1)
330
 
 
331
  if missing_dates:
332
+ print(f" Skipping {len(missing_dates)} date(s) with no data")
333
 
334
  return file_patterns
335
 
336
 
337
  # =============================================================================
338
+ # STREAMING BATCH PROCESSING
339
  # =============================================================================
340
 
341
+ def fetch_all_pr_metadata_streaming(conn, identifiers, start_date, end_date):
342
+ """
343
+ Fetch PR metadata using streaming batch processing.
344
+
345
+ Processes GHArchive files in BATCH_SIZE_DAYS chunks to limit memory usage.
346
+ Instead of loading 180 days (4,344 files) at once, processes 7 days at a time.
347
+
348
+ This prevents OOM errors by:
349
+ 1. Only keeping ~168 hourly files in memory per batch (vs 4,344)
350
+ 2. Incrementally building the results dictionary
351
+ 3. Allowing DuckDB to garbage collect after each batch
352
+
353
  Args:
354
  conn: DuckDB connection instance
355
+ identifiers: List of GitHub usernames/bot identifiers (~1500)
356
  start_date: Start datetime (timezone-aware)
357
  end_date: End datetime (timezone-aware)
358
+
359
  Returns:
360
+ Dictionary mapping agent identifier to list of PR metadata
361
  """
 
 
 
 
 
 
 
 
362
  identifier_list = ', '.join([f"'{id}'" for id in identifiers])
363
+ metadata_by_agent = defaultdict(list)
364
+
365
+ # Calculate total batches
366
+ total_days = (end_date - start_date).days
367
+ total_batches = (total_days // BATCH_SIZE_DAYS) + 1
368
+
369
+ # Process in configurable batches
370
+ current_date = start_date
371
+ batch_num = 0
372
+ total_prs = 0
373
+
374
+ print(f" Streaming {total_batches} batches of {BATCH_SIZE_DAYS}-day intervals...")
375
+
376
+ while current_date <= end_date:
377
+ batch_num += 1
378
+ batch_end = min(current_date + timedelta(days=BATCH_SIZE_DAYS - 1), end_date)
379
 
380
+ # Get file patterns for THIS BATCH ONLY (not all 180 days)
381
+ file_patterns = generate_file_path_patterns(current_date, batch_end)
382
+
383
+ if not file_patterns:
384
+ print(f" Batch {batch_num}/{total_batches}: {current_date.date()} to {batch_end.date()} - NO DATA")
385
+ current_date = batch_end + timedelta(days=1)
386
+ continue
387
+
388
+ # Progress indicator
389
+ print(f" Batch {batch_num}/{total_batches}: {current_date.date()} to {batch_end.date()} ({len(file_patterns)} files)... ", end="", flush=True)
390
+
391
+ # Build file patterns SQL for THIS BATCH
392
+ file_patterns_sql = '[' + ', '.join([f"'{fp}'" for fp in file_patterns]) + ']'
393
+
394
+ # Query for this batch
395
+ query = f"""
396
+ WITH pr_events AS (
397
+ SELECT
398
+ CONCAT(
399
+ REPLACE(repo.url, 'api.github.com/repos/', 'github.com/'),
400
+ '/pull/',
401
+ CAST(payload.pull_request.number AS VARCHAR)
402
+ ) as url,
403
+ actor.login as pr_author,
404
+ created_at as event_time,
405
+ payload.action as event_action
406
+ FROM read_json({file_patterns_sql}, union_by_name=true, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true, maximum_object_size=2147483648)
407
+ WHERE
408
+ type = 'PullRequestEvent'
409
+ AND payload.action IN ('opened', 'closed')
410
+ AND payload.pull_request.number IS NOT NULL
411
+ AND actor.login IN ({identifier_list})
412
+ ),
413
+ pr_timeline AS (
414
+ SELECT
415
+ url,
416
+ pr_author,
417
+ MIN(CASE WHEN event_action = 'opened' THEN event_time END) as created_at,
418
+ MAX(CASE WHEN event_action = 'closed' THEN event_time END) as closed_at,
419
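+ -- Note: GHArchive doesn't distinguish merged vs closed, so merged_at = NULL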
+ NULL as merged_at
420
+ FROM pr_events
421
+ GROUP BY url, pr_author
422
+ )
423
+ SELECT url, pr_author, created_at, merged_at, closed_at
424
+ FROM pr_timeline
425
+ WHERE created_at IS NOT NULL
426
+ """
427
+
428
+ try:
429
+ results = conn.execute(query).fetchall()
430
+ batch_prs = 0
431
+
432
+ # Add results to accumulating dictionary
433
+ for row in results:
434
+ url = row[0]
435
+ pr_author = row[1]
436
+ created_at = normalize_date_format(row[2]) if row[2] else None
437
+ merged_at = normalize_date_format(row[3]) if row[3] else None
438
+ closed_at = normalize_date_format(row[4]) if row[4] else None
439
+
440
+ if not url:
441
+ continue
442
+
443
+ pr_metadata = {
444
+ 'html_url': url,
445
+ 'created_at': created_at,
446
+ 'merged_at': merged_at,
447
+ 'closed_at': closed_at,
448
+ }
449
+
450
+ metadata_by_agent[pr_author].append(pr_metadata)
451
+ batch_prs += 1
452
+ total_prs += 1
453
+
454
+ print(f"✓ {batch_prs} PRs found")
455
+
456
+ except Exception as e:
457
+ print(f"\n ✗ Batch {batch_num} error: {str(e)}")
458
+ import traceback
459
+ traceback.print_exc()
460
+
461
+ # Move to next batch
462
+ current_date = batch_end + timedelta(days=1)
463
+
464
+ # Final summary
465
+ agents_with_data = sum(1 for prs in metadata_by_agent.values() if prs)
466
+ print(f"\n ✓ Complete: {total_prs} PRs found for {agents_with_data}/{len(identifiers)} agents")
467
+
468
+ return dict(metadata_by_agent)
469
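A minimal usage sketch for the streaming fetch, assuming the hourly GHArchive files are already on disk; the agent identifier below is a made-up placeholder:

    # Usage sketch (the identifier is hypothetical; pass real bot logins in practice).
    from datetime import datetime, timedelta, timezone

    conn = get_duckdb_connection()
    try:
        end = datetime.now(timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0)
        start = end - timedelta(days=14)  # small window for a test run
        metadata = fetch_all_pr_metadata_streaming(conn, ['example-bot[bot]'], start, end)
    finally:
        conn.close()

    for agent, prs in metadata.items():
        print(agent, len(prs), 'PRs')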
 
 
 
 
470
 
471
  def group_metadata_by_date(metadata_list):
472
+ """Group PR metadata by date for daily storage."""
 
 
 
473
  grouped = defaultdict(list)
474
 
475
  for pr_meta in metadata_list:
 
488
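To make the daily layout concrete, a small sketch of how a grouped bucket maps onto the per-agent daily files uploaded later (the PR record and agent name are invented; the filename format matches batch_upload_pr_metadata below):

    # Illustration: each (year, month, day) bucket becomes one {agent}/{YYYY.MM.DD}.jsonl file.
    sample = [{'html_url': 'https://github.com/org/repo/pull/1',  # hypothetical PR
               'created_at': '2025-06-17T21:21:07Z',
               'merged_at': None,
               'closed_at': None}]
    grouped = group_metadata_by_date(sample)  # expected: {(2025, 6, 17): [record]}
    for (pr_year, month, day), day_metadata in grouped.items():
        filename = f"{pr_year}.{month:02d}.{day:02d}.jsonl"
        print(filename, '->', f"example-agent[bot]/{filename}", f"({len(day_metadata)} records)")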
 
489
 
490
  def upload_single_file_with_retry(api, local_path, repo_path, repo_id, repo_type, commit_message, max_retries=MAX_RETRIES):
491
+ """Upload a single file with exponential backoff retry logic."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
492
  for attempt in range(max_retries):
493
  try:
494
  upload_file_with_backoff(
 
502
  return True
503
  except Exception as e:
504
  if attempt < max_retries - 1:
 
505
  wait_time = min(UPLOAD_INITIAL_BACKOFF * (2 ** attempt), UPLOAD_MAX_BACKOFF)
506
  print(f" {e} error on attempt {attempt + 1}/{max_retries}. Retrying in {wait_time}s...")
507
  time.sleep(wait_time)
 
512
 
513
 
514
  def batch_upload_pr_metadata(all_metadata):
515
+ """Upload PR metadata for all agents with time gaps between uploads."""
 
 
 
 
 
 
 
 
 
516
  try:
517
  token = get_hf_token()
518
  if not token:
 
524
  error_count = 0
525
  total_files = 0
526
 
 
527
  for agent_identifier, metadata_list in all_metadata.items():
528
  if metadata_list:
529
  grouped = group_metadata_by_date(metadata_list)
 
537
  if not metadata_list:
538
  continue
539
 
 
540
  grouped = group_metadata_by_date(metadata_list)
541
 
 
542
  agent_temp_dir = tempfile.mkdtemp()
543
 
544
  try:
 
545
  local_files = []
546
  for (pr_year, month, day), day_metadata in grouped.items():
547
  filename = f"{pr_year}.{month:02d}.{day:02d}.jsonl"
548
  local_path = os.path.join(agent_temp_dir, filename)
549
  repo_path = f"{agent_identifier}/{filename}"
550
 
 
551
  day_metadata.sort(key=lambda x: x.get('created_at', ''), reverse=True)
 
 
552
  save_jsonl(local_path, day_metadata)
553
  local_files.append((local_path, repo_path, len(day_metadata)))
554
 
 
555
  agent_success = 0
556
  agent_error = 0
557
 
 
573
  agent_error += 1
574
  error_count += 1
575
 
 
576
  if file_idx < len(local_files):
577
  time.sleep(UPLOAD_DELAY_SECONDS)
578
 
579
  finally:
 
580
  if os.path.exists(agent_temp_dir):
581
  import shutil
582
  shutil.rmtree(agent_temp_dir)
 
596
 
597
 
598
  def load_agents_from_hf():
599
+ """Load all agent metadata JSON files from HuggingFace dataset."""
 
 
 
 
600
  try:
601
  api = HfApi()
602
  agents = []
603
 
 
604
  files = list_repo_files_with_backoff(api=api, repo_id=AGENTS_REPO, repo_type="dataset")
 
 
605
  json_files = [f for f in files if f.endswith('.json')]
606
 
 
607
  for json_file in json_files:
608
  try:
609
  file_path = hf_hub_download_with_backoff(
 
615
  with open(file_path, 'r') as f:
616
  agent_data = json.load(f)
617
 
 
618
  if agent_data.get('status') != 'public':
619
  continue
620
 
 
621
  github_identifier = json_file.replace('.json', '')
622
  agent_data['github_identifier'] = github_identifier
623
 
 
628
  continue
629
 
630
  print(f"Download complete: {len(agents)} agents")
 
631
  return agents
632
 
633
  except Exception as e:
 
635
  return []
636
 
637
 
 
 
 
 
638
  def calculate_pr_stats_from_metadata(metadata_list):
639
+ """Calculate statistics from a list of PR metadata."""
 
 
 
 
 
640
  total_prs = len(metadata_list)
641
  merged = sum(1 for pr_meta in metadata_list if pr_meta.get('merged_at'))
 
 
642
  closed_not_merged = sum(1 for pr_meta in metadata_list
643
  if pr_meta.get('closed_at') and not pr_meta.get('merged_at'))
644
 
 
645
  total_decisions = merged + closed_not_merged
 
 
646
  acceptance_rate = (merged / total_decisions * 100) if total_decisions > 0 else 0
647
 
648
  return {
 
653
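A worked example of the acceptance-rate formula used above (merged / (merged + closed-but-not-merged) * 100, with open PRs excluded from the denominator); the records are invented:

    # Worked example: one merged, one closed without merge, one still open.
    prs = [
        {'merged_at': '2025-05-01T12:00:00Z', 'closed_at': '2025-05-01T12:00:00Z'},  # merged
        {'merged_at': None, 'closed_at': '2025-05-02T09:00:00Z'},                    # rejected
        {'merged_at': None, 'closed_at': None},                                      # still open
    ]
    merged = sum(1 for p in prs if p.get('merged_at'))
    closed_not_merged = sum(1 for p in prs if p.get('closed_at') and not p.get('merged_at'))
    decisions = merged + closed_not_merged
    print(round(merged / decisions * 100, 1))  # 50.0 -- the open PR is not counted as a decision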
 
654
 
655
  def calculate_monthly_metrics_by_agent(all_metadata_dict, agents):
656
+ """Calculate monthly metrics for all agents for visualization."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
657
  identifier_to_name = {agent.get('github_identifier'): agent.get('name') for agent in agents if agent.get('github_identifier')}
658
 
659
  if not all_metadata_dict:
660
  return {'agents': [], 'months': [], 'data': {}}
661
 
 
662
  agent_month_data = defaultdict(lambda: defaultdict(list))
663
 
 
664
  for agent_identifier, metadata_list in all_metadata_dict.items():
665
  for pr_meta in metadata_list:
666
  created_at = pr_meta.get('created_at')
 
668
  if not created_at:
669
  continue
670
 
 
671
  agent_name = identifier_to_name.get(agent_identifier, agent_identifier)
672
 
673
  try:
 
678
  print(f"Warning: Could not parse date '{created_at}': {e}")
679
  continue
680
 
 
681
  all_months = set()
682
  for agent_data in agent_month_data.values():
683
  all_months.update(agent_data.keys())
684
  months = sorted(list(all_months))
685
 
 
686
  result_data = {}
687
  for agent_name, month_dict in agent_month_data.items():
688
  acceptance_rates = []
 
693
  for month in months:
694
  prs_in_month = month_dict.get(month, [])
695
 
 
696
  merged_count = sum(1 for pr in prs_in_month if pr.get('merged_at'))
 
 
697
  closed_not_merged_count = sum(1 for pr in prs_in_month
698
  if pr.get('closed_at') and not pr.get('merged_at'))
 
 
699
  total_count = len(prs_in_month)
700
 
 
701
  total_decisions = merged_count + closed_not_merged_count
702
  acceptance_rate = (merged_count / total_decisions * 100) if total_decisions > 0 else None
703
 
 
723
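The shape of the returned structure follows the previous version's docstring; a small example of what it might look like (agent name, months and numbers are invented for illustration):

    # Example result shape from calculate_monthly_metrics_by_agent (values are invented).
    example_monthly_metrics = {
        'agents': ['Example Agent'],
        'months': ['2025-01', '2025-02'],
        'data': {
            'Example Agent': {
                'acceptance_rates': [66.7, None],  # None when a month has no merged/closed decisions
                'total_prs': [12, 3],
                'merged_prs': [8, 0],
            },
        },
    }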
 
724
 
725
  def construct_leaderboard_from_metadata(all_metadata_dict, agents):
726
+ """Construct leaderboard from in-memory PR metadata."""
 
 
 
 
 
 
 
 
 
727
  if not agents:
728
  print("Error: No agents found")
729
  return {}
 
734
  identifier = agent.get('github_identifier')
735
  agent_name = agent.get('name', 'Unknown')
736
 
 
737
  bot_metadata = all_metadata_dict.get(identifier, [])
 
 
738
  stats = calculate_pr_stats_from_metadata(bot_metadata)
739
 
740
  cache_dict[identifier] = {
 
748
 
749
 
750
  def save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics):
751
+ """Save leaderboard data and monthly metrics to HuggingFace dataset."""
 
 
 
 
 
 
 
 
 
752
  try:
753
  token = get_hf_token()
754
  if not token:
 
757
  api = HfApi(token=token)
758
  filename = "swe-pr.json"
759
 
 
760
  combined_data = {
761
  'last_updated': datetime.now(timezone.utc).isoformat(),
762
  'leaderboard': leaderboard_dict,
 
766
  }
767
  }
768
 
 
769
  with open(filename, 'w') as f:
770
  json.dump(combined_data, f, indent=2)
771
 
772
  try:
 
773
  upload_file_with_backoff(
774
  api=api,
775
  path_or_fileobj=filename,
 
779
  )
780
  return True
781
  finally:
 
782
  if os.path.exists(filename):
783
  os.remove(filename)
784
 
 
790
 
791
 
792
  # =============================================================================
793
+ # MINING FUNCTION
794
  # =============================================================================
795
 
796
  def mine_all_agents():
797
  """
798
+ Mine PR metadata for all agents using streaming batch processing.
799
+ Downloads GHArchive data, then runs batched DuckDB queries over it.
800
  """
 
801
  print(f"\n[1/5] Downloading GHArchive data...")
802
 
803
  if not download_all_gharchive_data():
804
  print("Warning: Download had errors, continuing with available data...")
805
 
 
806
  print(f"\n[2/5] Loading agent metadata...")
807
 
808
  agents = load_agents_from_hf()
 
810
  print("Error: No agents found")
811
  return
812
 
 
813
  identifiers = [agent['github_identifier'] for agent in agents if agent.get('github_identifier')]
814
  if not identifiers:
815
  print("Error: No valid agent identifiers found")
 
817
 
818
  print(f"\n[3/5] Mining PR metadata ({len(identifiers)} agents, {LEADERBOARD_TIME_FRAME_DAYS} days)...")
819
 
 
820
  try:
821
  conn = get_duckdb_connection()
822
  except Exception as e:
823
  print(f"Failed to initialize DuckDB connection: {str(e)}")
824
  return
825
 
 
826
  current_time = datetime.now(timezone.utc)
827
  end_date = current_time.replace(hour=0, minute=0, second=0, microsecond=0)
828
  start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
829
 
830
  try:
831
+ # Use the streaming batch fetch (replaces the previous single-query approach)
832
+ all_metadata = fetch_all_pr_metadata_streaming(
833
  conn, identifiers, start_date, end_date
834
  )
835
 
 
836
  total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
837
  agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
838
 
 
 
839
  except Exception as e:
840
  print(f"Error during DuckDB fetch: {str(e)}")
841
  import traceback
842
  traceback.print_exc()
843
  return
844
  finally:
 
845
  conn.close()
846
 
 
847
  print(f"\n[4/5] Uploading PR metadata...")
848
 
849
  success_count, error_count = batch_upload_pr_metadata(all_metadata)
850
 
 
851
  print(f"\n[5/5] Saving leaderboard...")
852
 
853
  try:
 
854
  leaderboard_dict = construct_leaderboard_from_metadata(all_metadata, agents)
 
 
855
  monthly_metrics = calculate_monthly_metrics_by_agent(all_metadata, agents)
 
 
856
  save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics)
857
 
858
  print(f"\nCOMPLETE: {success_count} files uploaded" + (f", {error_count} errors" if error_count > 0 else ""))
 
868
  # =============================================================================
869
 
870
  def setup_scheduler():
871
+ """Set up APScheduler to run mining jobs periodically."""
 
 
 
 
 
 
 
 
 
 
 
872
  logging.basicConfig(
873
  level=logging.INFO,
874
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
875
  )
876
 
 
877
  logging.getLogger('httpx').setLevel(logging.WARNING)
878
 
 
879
  scheduler = BlockingScheduler(timezone=SCHEDULE_TIMEZONE)
880
 
 
881
  trigger = CronTrigger(
882
  day=SCHEDULE_DAY_OF_MONTH,
883
  hour=SCHEDULE_HOUR,
 
885
  timezone=SCHEDULE_TIMEZONE
886
  )
887
 
 
888
  scheduler.add_job(
889
  mine_all_agents,
890
  trigger=trigger,
 
893
  replace_existing=True
894
  )
895
 
 
896
  from datetime import datetime
897
  next_run = trigger.get_next_fire_time(None, datetime.now(trigger.timezone))
898
  print(f"Scheduler: Monthly on day {SCHEDULE_DAY_OF_MONTH} at {SCHEDULE_HOUR:02d}:{SCHEDULE_MINUTE:02d} {SCHEDULE_TIMEZONE}")
899
  print(f"Next run: {next_run}\n")
900
 
 
901
  print(f"\nScheduler started")
902
  scheduler.start()
903
 
 
908
 
909
  if __name__ == "__main__":
910
  if SCHEDULE_ENABLED:
 
911
  setup_scheduler()
912
  else:
913
+ mine_all_agents()