zhimin-z committed on
Commit f5d00b4 · 1 Parent(s): c3011cc
Files changed (1)
  1. msr.py +184 -469
msr.py CHANGED
@@ -25,31 +25,35 @@ load_dotenv()
25
 
26
  AGENTS_REPO = "SWE-Arena/bot_metadata"
27
  REVIEW_METADATA_REPO = "SWE-Arena/review_metadata"
28
- LEADERBOARD_REPO = "SWE-Arena/leaderboard_metadata" # HuggingFace dataset for leaderboard data
29
- LEADERBOARD_TIME_FRAME_DAYS = 180 # Time frame for leaderboard
30
- GHARCHIVE_DATA_DIR = "../gharchive/data" # Local GHArchive data directory
31
- DUCKDB_CACHE_FILE = "cache.duckdb" # Persistent DuckDB database for caching
32
 
33
- # DuckDB performance configuration
34
- DUCKDB_THREADS = 8 # Number of threads for parallel processing
35
- DUCKDB_MEMORY_LIMIT = "64GB" # Memory limit to prevent OOM crashes
36
 
37
  # Download configuration
38
- DOWNLOAD_WORKERS = 4 # Number of parallel download threads
39
- DOWNLOAD_RETRY_DELAY = 2 # Initial retry delay in seconds
40
- MAX_RETRIES = 5 # Maximum number of retries for each API call
41
 
42
  # Upload configuration
43
- UPLOAD_DELAY_SECONDS = 5 # Delay between individual file uploads to avoid rate limits
44
- UPLOAD_INITIAL_BACKOFF = 60 # Initial backoff time in seconds (1 minute)
45
- UPLOAD_MAX_BACKOFF = 3600 # Maximum backoff time in seconds (60 minutes)
46
 
47
  # Scheduler configuration
48
- SCHEDULE_ENABLED = True # Enable/disable scheduler
49
- SCHEDULE_DAY_OF_MONTH = 22 # Day of month (1-31) - 22nd is in the fourth week
50
- SCHEDULE_HOUR = 0 # Hour (0-23) - 12am midnight
51
- SCHEDULE_MINUTE = 0 # Minute (0-59)
52
- SCHEDULE_TIMEZONE = 'UTC' # Timezone for scheduling
53
 
54
  # =============================================================================
55
  # UTILITY FUNCTIONS
@@ -80,34 +84,24 @@ def save_jsonl(filename, data):
80
 
81
 
82
  def normalize_date_format(date_string):
83
- """
84
- Convert date strings to standardized ISO 8601 format with Z suffix.
85
- Handles both 'T' and space-separated datetime formats (including newlines).
86
- Examples:
87
- - 2025-10-15T23:23:47.983068 -> 2025-10-15T23:23:47Z
88
- - 2025-06-17 21:21:07+00 -> 2025-06-17T21:21:07Z
89
- """
90
  if not date_string or date_string == 'N/A':
91
  return 'N/A'
92
 
93
  try:
94
  import re
95
- # Remove all whitespace (spaces, newlines, tabs) and replace with single space
96
- date_string = re.sub(r'\s+', ' ', date_string.strip())
97
 
98
- # Replace space with 'T' for ISO format compatibility
99
  date_string = date_string.replace(' ', 'T')
100
 
101
- # Fix incomplete timezone offset (+00 or -00 -> +00:00 or -00:00)
102
- # Check if timezone offset exists and is incomplete
103
  if len(date_string) >= 3:
104
  if date_string[-3:-2] in ('+', '-') and ':' not in date_string[-3:]:
105
  date_string = date_string + ':00'
106
 
107
- # Parse the date string (handles both with and without microseconds)
108
  dt = datetime.fromisoformat(date_string.replace('Z', '+00:00'))
109
-
110
- # Convert to standardized format
111
  return dt.strftime('%Y-%m-%dT%H:%M:%SZ')
112
  except Exception as e:
113
  print(f"Warning: Could not parse date '{date_string}': {e}")
@@ -127,23 +121,13 @@ def get_hf_token():
127
  # =============================================================================
128
 
129
  def download_file(url):
130
- """
131
- Download a GHArchive file with retry logic.
132
-
133
- Args:
134
- url: URL to download
135
-
136
- Returns:
137
- bool: True if successful, False otherwise
138
- """
139
  filename = url.split("/")[-1]
140
  filepath = os.path.join(GHARCHIVE_DATA_DIR, filename)
141
 
142
- # Skip if json.gz already exists
143
  if os.path.exists(filepath):
144
  return True
145
 
146
- # Download with retry logic
147
  for attempt in range(MAX_RETRIES):
148
  try:
149
  response = requests.get(url, timeout=30)
@@ -154,12 +138,10 @@ def download_file(url):
154
 
155
  except requests.exceptions.HTTPError as e:
156
  if e.response.status_code == 404:
157
- # File doesn't exist, don't retry
158
  return False
159
  else:
160
- # Other HTTP errors, retry
161
  if attempt < MAX_RETRIES - 1:
162
- wait_time = DOWNLOAD_RETRY_DELAY * (2 ** attempt) # Exponential backoff
163
  print(f" ⚠ {filename}: HTTP error {e.response.status_code}, retrying in {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES})")
164
  time.sleep(wait_time)
165
  else:
@@ -168,16 +150,14 @@ def download_file(url):
168
  except (requests.exceptions.Timeout,
169
  requests.exceptions.ConnectionError,
170
  requests.exceptions.ReadTimeout) as e:
171
- # Timeout/connection errors, retry
172
  if attempt < MAX_RETRIES - 1:
173
- wait_time = DOWNLOAD_RETRY_DELAY * (2 ** attempt) # Exponential backoff
174
  print(f" ⚠ {filename}: {type(e).__name__}, retrying in {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES})")
175
  time.sleep(wait_time)
176
  else:
177
  print(f" ✗ {filename}: Failed after {MAX_RETRIES} attempts - {type(e).__name__}")
178
 
179
  except Exception as e:
180
- # Other errors, retry
181
  if attempt < MAX_RETRIES - 1:
182
  wait_time = DOWNLOAD_RETRY_DELAY * (2 ** attempt)
183
  print(f" ⚠ {filename}: {e}, retrying in {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES})")
@@ -189,17 +169,9 @@ def download_file(url):
189
 
190
 
191
  def download_all_gharchive_data():
192
- """
193
- Download all GHArchive data files for the last LEADERBOARD_TIME_FRAME_DAYS.
194
- Uses parallel downloads with ThreadPoolExecutor.
195
-
196
- Returns:
197
- bool: True if all downloads completed (some may have failed), False if critical error
198
- """
199
- # Create data directory if it doesn't exist
200
  os.makedirs(GHARCHIVE_DATA_DIR, exist_ok=True)
201
 
202
- # Generate URLs for last N days (hourly files: 0-23 for each day)
203
  end_date = datetime.now()
204
  start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
205
 
@@ -207,7 +179,6 @@ def download_all_gharchive_data():
207
  current_date = start_date
208
  while current_date <= end_date:
209
  date_str = current_date.strftime("%Y-%m-%d")
210
- # Generate hourly URLs for this day (0-23)
211
  for hour in range(24):
212
  url = f"https://data.gharchive.org/{date_str}-{hour}.json.gz"
213
  urls.append(url)
@@ -217,10 +188,7 @@ def download_all_gharchive_data():
217
 
218
  try:
219
  with ThreadPoolExecutor(max_workers=DOWNLOAD_WORKERS) as executor:
220
- # Submit all downloads
221
  futures = [executor.submit(download_file, url) for url in urls]
222
-
223
- # Wait for downloads to complete
224
  for future in as_completed(futures):
225
  downloads_processed += 1
226
 
@@ -235,25 +203,20 @@ def download_all_gharchive_data():
235
 
236
 
237
  # =============================================================================
238
- # HUGGINGFACE API WRAPPERS WITH ENHANCED BACKOFF
239
  # =============================================================================
240
 
241
  def is_retryable_error(e):
242
- """
243
- Check if exception is retryable (rate limit or timeout error).
244
- """
245
- # Check for rate limit error (429)
246
  if isinstance(e, HfHubHTTPError):
247
  if e.response.status_code == 429:
248
  return True
249
 
250
- # Check for timeout errors
251
  if isinstance(e, (requests.exceptions.Timeout,
252
  requests.exceptions.ReadTimeout,
253
  requests.exceptions.ConnectTimeout)):
254
  return True
255
 
256
- # Check if it's a timeout error wrapped in HfHubHTTPError
257
  if isinstance(e, Exception):
258
  error_str = str(e).lower()
259
  if 'timeout' in error_str or 'timed out' in error_str:
@@ -274,7 +237,7 @@ def is_retryable_error(e):
274
  )
275
  )
276
  def list_repo_files_with_backoff(api, **kwargs):
277
- """Wrapper for api.list_repo_files() with exponential backoff for retryable errors."""
278
  return api.list_repo_files(**kwargs)
279
 
280
 
@@ -290,7 +253,7 @@ def list_repo_files_with_backoff(api, **kwargs):
290
  )
291
  )
292
  def hf_hub_download_with_backoff(**kwargs):
293
- """Wrapper for hf_hub_download() with exponential backoff for retryable errors."""
294
  return hf_hub_download(**kwargs)
295
 
296
 
@@ -306,7 +269,7 @@ def hf_hub_download_with_backoff(**kwargs):
306
  )
307
  )
308
  def upload_file_with_backoff(api, **kwargs):
309
- """Wrapper for api.upload_file() with exponential backoff for retryable errors."""
310
  return api.upload_file(**kwargs)
311
 
312
 
@@ -322,44 +285,30 @@ def upload_file_with_backoff(api, **kwargs):
322
  )
323
  )
324
  def upload_folder_with_backoff(api, **kwargs):
325
- """Wrapper for api.upload_folder() with exponential backoff for retryable errors."""
326
  return api.upload_folder(**kwargs)
327
 
328
 
329
  def get_duckdb_connection():
330
  """
331
- Initialize DuckDB connection with persistent database and optimized parallelization.
332
-
333
- Returns:
334
- DuckDB connection object
335
  """
336
- # Use persistent database for caching results
337
  conn = duckdb.connect(DUCKDB_CACHE_FILE)
338
 
339
- # Optimize for parallel processing with memory limits
340
- conn.execute(f"SET threads TO {DUCKDB_THREADS};") # Configure parallel threads
341
- conn.execute("SET preserve_insertion_order = false;") # Better parallelization
342
- conn.execute("SET enable_object_cache = true;") # Cache objects for reuse
343
- conn.execute("SET temp_directory = '/tmp/duckdb_temp';") # Use fast temp storage if needed
344
- conn.execute(f"SET memory_limit = '{DUCKDB_MEMORY_LIMIT}';") # Limit memory to prevent OOM crashes
345
- conn.execute(f"SET max_memory = '{DUCKDB_MEMORY_LIMIT}';") # Hard memory cap
346
 
347
  return conn
348
 
349
 
350
  def generate_file_path_patterns(start_date, end_date, data_dir=GHARCHIVE_DATA_DIR):
351
- """
352
- Generate file path patterns for GHArchive data in date range.
353
- Only includes files that actually exist on disk.
354
-
355
- Args:
356
- start_date: Start datetime
357
- end_date: End datetime
358
- data_dir: Directory containing GHArchive data files
359
-
360
- Returns:
361
- List of file path patterns (hourly JSON.gz files) that exist
362
- """
363
  file_patterns = []
364
  missing_dates = set()
365
 
@@ -367,40 +316,39 @@ def generate_file_path_patterns(start_date, end_date, data_dir=GHARCHIVE_DATA_DI
367
  end_day = end_date.replace(hour=0, minute=0, second=0, microsecond=0)
368
 
369
  while current_date <= end_day:
370
- # Pattern for hourly JSON.gz files: 2024-11-15-{0..23}.json.gz
371
  date_has_files = False
372
  for hour in range(24):
373
  pattern = os.path.join(data_dir, f"{current_date.strftime('%Y-%m-%d')}-{hour}.json.gz")
374
- # Only add pattern if file exists
375
  if os.path.exists(pattern):
376
  file_patterns.append(pattern)
377
  date_has_files = True
378
 
379
- # Track missing dates
380
  if not date_has_files:
381
  missing_dates.add(current_date.strftime('%Y-%m-%d'))
382
 
383
- # Move to next day
384
  current_date += timedelta(days=1)
385
 
386
- # Print warning about missing dates
387
  if missing_dates:
388
- print(f" Warning: Skipping {len(missing_dates)} date(s) with no data files: {', '.join(sorted(missing_dates))}")
389
 
390
  return file_patterns
391
 
392
 
393
  # =============================================================================
394
- # DUCKDB QUERY FUNCTIONS
395
  # =============================================================================
396
 
397
- def fetch_all_pr_metadata_single_query(conn, identifiers, start_date, end_date):
398
  """
399
- Fetch PR review metadata for ALL agents using ONE comprehensive DuckDB query.
400
 
401
- This query combines:
402
- 1. Review events (PullRequestReviewEvent) for all agents
403
- 2. PR status (PullRequestEvent with action='closed')
404
 
405
  Args:
406
  conn: DuckDB connection instance
@@ -409,218 +357,128 @@ def fetch_all_pr_metadata_single_query(conn, identifiers, start_date, end_date):
409
  end_date: End datetime (timezone-aware)
410
 
411
  Returns:
412
- Dictionary mapping agent identifier to list of PR metadata:
413
- {
414
- 'agent-identifier': [
415
- {
416
- 'url': PR URL,
417
- 'reviewed_at': Review timestamp,
418
- 'merged_at': Merge timestamp (if merged, else None),
419
- 'closed_at': Close timestamp (if closed, else None)
420
- },
421
- ...
422
- ],
423
- ...
424
- }
425
  """
426
- # Generate file path patterns for review period
427
- review_patterns = generate_file_path_patterns(start_date, end_date)
428
 
429
- # Generate file path patterns for PR status (use same lookback as reviews)
430
- status_start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
431
- status_patterns = generate_file_path_patterns(status_start_date, end_date)
432
 
433
- # Build identifier list for IN clause
434
- identifier_list = ', '.join([f"'{id}'" for id in identifiers])
 
 
435
 
436
- # Convert file patterns to SQL array format for direct interpolation
437
- review_patterns_sql = str(review_patterns).replace("'", "'")
438
- status_patterns_sql = str(status_patterns).replace("'", "'")
439
-
440
- # Build comprehensive query with CTEs using direct SQL array format (JSON.gz format)
441
- # Optimized: Single file scan + ROW_NUMBER() deduplication (no DISTINCT)
442
- query = f"""
443
- WITH all_review_events AS (
444
- -- Single file scan for all three event types (optimization: 3x I/O reduction)
445
- SELECT
446
- TRY_CAST(type AS VARCHAR) as event_type,
447
- TRY_CAST(json_extract_string(actor, '$.login') AS VARCHAR) as reviewer,
448
- TRY_CAST(json_extract_string(repo, '$.name') AS VARCHAR) as repo_name,
449
- payload,
450
- created_at
451
- FROM read_json({review_patterns_sql}, union_by_name=true, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true, maximum_object_size=2147483648)
452
- WHERE
453
- TRY_CAST(type AS VARCHAR) IN ('PullRequestReviewEvent', 'IssueCommentEvent', 'PullRequestReviewCommentEvent')
454
- AND TRY_CAST(json_extract_string(actor, '$.login') AS VARCHAR) IN ({identifier_list})
455
- ),
456
-
457
- review_events AS (
458
- -- Process events conditionally based on type
459
- SELECT
460
- CASE
461
- WHEN event_type = 'IssueCommentEvent'
462
- THEN TRY_CAST(json_extract_string(payload, '$.issue.html_url') AS VARCHAR)
463
- ELSE TRY_CAST(json_extract_string(payload, '$.pull_request.html_url') AS VARCHAR)
464
- END as url,
465
- CASE
466
- WHEN event_type = 'PullRequestReviewEvent'
467
- THEN COALESCE(
468
- TRY_CAST(json_extract_string(payload, '$.review.submitted_at') AS VARCHAR),
469
- TRY_CAST(created_at AS VARCHAR)
470
- )
471
- ELSE TRY_CAST(created_at AS VARCHAR)
472
- END as reviewed_at,
473
- reviewer,
474
- repo_name,
475
- CASE
476
- WHEN event_type = 'IssueCommentEvent'
477
- THEN TRY_CAST(json_extract_string(payload, '$.issue.number') AS INTEGER)
478
- ELSE TRY_CAST(json_extract_string(payload, '$.pull_request.number') AS INTEGER)
479
- END as pr_number
480
- FROM all_review_events
481
- WHERE
482
- -- Validate required fields per event type
483
- (event_type = 'PullRequestReviewEvent' AND json_extract_string(payload, '$.pull_request.html_url') IS NOT NULL)
484
- OR (event_type = 'IssueCommentEvent' AND json_extract_string(payload, '$.issue.pull_request.url') IS NOT NULL AND json_extract_string(payload, '$.issue.html_url') IS NOT NULL)
485
- OR (event_type = 'PullRequestReviewCommentEvent' AND json_extract_string(payload, '$.pull_request.html_url') IS NOT NULL)
486
- ),
487
-
488
- pr_status AS (
489
- -- Get merge/close status for those PRs
490
- SELECT
491
- TRY_CAST(json_extract_string(payload, '$.pull_request.html_url') AS VARCHAR) as url,
492
- TRY_CAST(json_extract_string(payload, '$.pull_request.merged') AS BOOLEAN) as is_merged,
493
- TRY_CAST(json_extract_string(payload, '$.pull_request.merged_at') AS VARCHAR) as merged_at,
494
- TRY_CAST(json_extract_string(payload, '$.pull_request.closed_at') AS VARCHAR) as closed_at,
495
- created_at,
496
- ROW_NUMBER() OVER (PARTITION BY json_extract_string(payload, '$.pull_request.html_url') ORDER BY created_at DESC) as rn
497
- FROM read_json({status_patterns_sql}, union_by_name=true, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true, maximum_object_size=2147483648)
498
- WHERE
499
- type = 'PullRequestEvent'
500
- AND TRY_CAST(json_extract_string(payload, '$.action') AS VARCHAR) = 'closed'
501
- AND json_extract_string(payload, '$.pull_request.html_url') IS NOT NULL
502
- AND json_extract_string(payload, '$.pull_request.html_url') IN (
503
- SELECT DISTINCT url FROM review_events
504
- )
505
- ),
506
-
507
- deduplicated_reviews AS (
508
- -- Efficient deduplication using ROW_NUMBER() instead of DISTINCT (optimization: prevents massive hash table)
509
- SELECT
510
- re.reviewer,
511
- re.url,
512
- re.reviewed_at,
513
- ps.merged_at,
514
- ps.closed_at,
515
- ROW_NUMBER() OVER (
516
- PARTITION BY re.reviewer, re.url, re.reviewed_at
517
- ORDER BY re.reviewed_at
518
- ) as row_num
519
- FROM review_events re
520
- LEFT JOIN (SELECT * FROM pr_status WHERE rn = 1) ps ON re.url = ps.url
521
- )
522
 
523
- -- Return deduplicated results (row_num = 1 ensures uniqueness without DISTINCT)
524
- SELECT
525
- reviewer,
526
- url,
527
- reviewed_at,
528
- merged_at,
529
- closed_at
530
- FROM deduplicated_reviews
531
- WHERE row_num = 1
532
- ORDER BY reviewer, reviewed_at DESC
533
- """
534
 
535
- try:
536
- # Create cache table name based on date range
537
- cache_table_name = f"pr_cache_{start_date.strftime('%Y%m%d')}_{end_date.strftime('%Y%m%d')}"
538
-
539
- # Check if cache exists and is valid
540
- cache_exists = conn.execute(f"""
541
- SELECT COUNT(*) FROM information_schema.tables
542
- WHERE table_name = '{cache_table_name}'
543
- """).fetchone()[0] > 0
544
-
545
- if cache_exists:
546
- results = conn.execute(f"""
547
- SELECT reviewer, url, reviewed_at, merged_at, closed_at
548
- FROM {cache_table_name}
549
- WHERE reviewer IN ({identifier_list})
550
- """).fetchall()
551
- else:
552
553
  results = conn.execute(query).fetchall()
554
 
555
- # Cache the complete results for all future queries in this date range
556
- if len(results) > 0:
557
- conn.execute(f"""
558
- CREATE TABLE {cache_table_name} AS
559
- SELECT * FROM (
560
- SELECT UNNEST($1) as reviewer, UNNEST($2) as url,
561
- UNNEST($3) as reviewed_at, UNNEST($4) as merged_at,
562
- UNNEST($5) as closed_at
563
- )
564
- """, [
565
- [r[0] for r in results],
566
- [r[1] for r in results],
567
- [r[2] for r in results],
568
- [r[3] for r in results],
569
- [r[4] for r in results]
570
- ])
571
-
572
- # Group results by agent with verification
573
- metadata_by_agent = defaultdict(list)
574
- unique_reviews = set()
575
- duplicate_count = 0
576
-
577
- for row in results:
578
- reviewer = row[0]
579
- url = row[1]
580
- reviewed_at = normalize_date_format(row[2]) if row[2] else None
581
- merged_at = normalize_date_format(row[3]) if row[3] else None
582
- closed_at = normalize_date_format(row[4]) if row[4] else None
583
-
584
- # Track unique review combinations for verification
585
- review_key = (reviewer, url, reviewed_at)
586
- if review_key in unique_reviews:
587
- duplicate_count += 1
588
- unique_reviews.add(review_key)
589
-
590
- metadata_by_agent[reviewer].append({
591
- 'url': url,
592
- 'reviewed_at': reviewed_at,
593
- 'merged_at': merged_at,
594
- 'closed_at': closed_at,
595
- })
596
-
597
- # Verification: Ensure we have unique reviews (no duplicates from query)
598
- total_reviews = len(results)
599
- if duplicate_count > 0:
600
- print(f" Warning: Found {duplicate_count} duplicate review entries in query results!")
601
- print(f" Total: {total_reviews}, Unique: {len(unique_reviews)}")
602
- else:
603
- print(f" Verification passed: {len(unique_reviews)} unique reviews retrieved (no duplicates)")
604
 
605
- # Convert defaultdict to regular dict
606
- return dict(metadata_by_agent)
607
 
608
- except Exception as e:
609
- print(f"DuckDB error: {str(e)}")
610
- import traceback
611
- traceback.print_exc()
612
- return {}
613
 
614
 
615
  # =============================================================================
616
- # HUGGINGFACE STORAGE FUNCTIONS WITH BATCH UPLOAD
617
  # =============================================================================
618
 
619
  def group_metadata_by_date(metadata_list):
620
- """
621
- Group review metadata by date (year.month.day) for daily storage.
622
- Returns dict: {(year, month, day): [metadata_list]}
623
- """
624
  grouped = defaultdict(list)
625
 
626
  for review_meta in metadata_list:
@@ -639,21 +497,7 @@ def group_metadata_by_date(metadata_list):
639
 
640
 
641
  def upload_single_file_with_retry(api, local_path, repo_path, repo_id, repo_type, commit_message, max_retries=MAX_RETRIES):
642
- """
643
- Upload a single file with exponential backoff retry logic.
644
-
645
- Args:
646
- api: HfApi instance
647
- local_path: Local file path
648
- repo_path: Path in repository
649
- repo_id: Repository ID
650
- repo_type: Repository type (e.g., "dataset")
651
- commit_message: Commit message
652
- max_retries: Maximum number of retries
653
-
654
- Returns:
655
- bool: True if successful, False otherwise
656
- """
657
  for attempt in range(max_retries):
658
  try:
659
  upload_file_with_backoff(
@@ -667,7 +511,6 @@ def upload_single_file_with_retry(api, local_path, repo_path, repo_id, repo_type
667
  return True
668
  except Exception as e:
669
  if attempt < max_retries - 1:
670
- # Calculate exponential backoff
671
  wait_time = min(UPLOAD_INITIAL_BACKOFF * (2 ** attempt), UPLOAD_MAX_BACKOFF)
672
  print(f" {e} error on attempt {attempt + 1}/{max_retries}. Retrying in {wait_time}s...")
673
  time.sleep(wait_time)
@@ -678,16 +521,7 @@ def upload_single_file_with_retry(api, local_path, repo_path, repo_id, repo_type
678
 
679
 
680
  def batch_upload_review_metadata(all_metadata):
681
- """
682
- Upload review metadata for all agents with time gaps between uploads.
683
- Each agent's data is uploaded as separate daily files with retry logic.
684
-
685
- Args:
686
- all_metadata: Dictionary mapping agent identifier to list of PR metadata
687
-
688
- Returns:
689
- tuple: (success_count, error_count)
690
- """
691
  try:
692
  token = get_hf_token()
693
  if not token:
@@ -699,7 +533,6 @@ def batch_upload_review_metadata(all_metadata):
699
  error_count = 0
700
  total_files = 0
701
 
702
- # First, calculate total number of files to upload
703
  for agent_identifier, metadata_list in all_metadata.items():
704
  if metadata_list:
705
  grouped = group_metadata_by_date(metadata_list)
@@ -713,28 +546,21 @@ def batch_upload_review_metadata(all_metadata):
713
  if not metadata_list:
714
  continue
715
 
716
- # Group by date
717
  grouped = group_metadata_by_date(metadata_list)
718
 
719
- # Create temporary files for this agent
720
  agent_temp_dir = tempfile.mkdtemp()
721
 
722
  try:
723
- # Prepare all files locally
724
  local_files = []
725
  for (review_year, month, day), day_metadata in grouped.items():
726
  filename = f"{review_year}.{month:02d}.{day:02d}.jsonl"
727
  local_path = os.path.join(agent_temp_dir, filename)
728
  repo_path = f"{agent_identifier}/{filename}"
729
 
730
- # Sort by reviewed_at for better organization
731
  day_metadata.sort(key=lambda x: x.get('reviewed_at', ''), reverse=True)
732
-
733
- # Save to temp file
734
  save_jsonl(local_path, day_metadata)
735
  local_files.append((local_path, repo_path, len(day_metadata)))
736
 
737
- # Upload each file with delay
738
  agent_success = 0
739
  agent_error = 0
740
 
@@ -756,12 +582,10 @@ def batch_upload_review_metadata(all_metadata):
756
  agent_error += 1
757
  error_count += 1
758
 
759
- # Add delay between uploads (except for last file)
760
  if file_idx < len(local_files):
761
  time.sleep(UPLOAD_DELAY_SECONDS)
762
 
763
  finally:
764
- # Clean up temp directory
765
  if os.path.exists(agent_temp_dir):
766
  import shutil
767
  shutil.rmtree(agent_temp_dir)
@@ -781,22 +605,14 @@ def batch_upload_review_metadata(all_metadata):
781
 
782
 
783
  def load_agents_from_hf():
784
- """
785
- Load all agent metadata JSON files from HuggingFace dataset.
786
-
787
- The github_identifier is extracted from the filename (e.g., 'agent-name[bot].json' -> 'agent-name[bot]')
788
- """
789
  try:
790
  api = HfApi()
791
  agents = []
792
 
793
- # List all files in the repository
794
  files = list_repo_files_with_backoff(api=api, repo_id=AGENTS_REPO, repo_type="dataset")
795
-
796
- # Filter for JSON files only
797
  json_files = [f for f in files if f.endswith('.json')]
798
 
799
- # Download and parse each JSON file
800
  for json_file in json_files:
801
  try:
802
  file_path = hf_hub_download_with_backoff(
@@ -808,11 +624,9 @@ def load_agents_from_hf():
808
  with open(file_path, 'r') as f:
809
  agent_data = json.load(f)
810
 
811
- # Only process agents with status == "public"
812
  if agent_data.get('status') != 'public':
813
  continue
814
 
815
- # Extract github_identifier from filename (remove .json extension)
816
  github_identifier = json_file.replace('.json', '')
817
  agent_data['github_identifier'] = github_identifier
818
 
@@ -823,7 +637,6 @@ def load_agents_from_hf():
823
  continue
824
 
825
  print(f"Download complete: {len(agents)} agents")
826
-
827
  return agents
828
 
829
  except Exception as e:
@@ -831,13 +644,12 @@ def load_agents_from_hf():
831
  return []
832
 
833
 
834
- def get_pr_status_from_metadata(review_meta):
835
- """
836
- Derive PR status from merged_at and closed_at fields.
837
 
838
- Returns:
839
- str: 'merged', 'closed', or 'open'
840
- """
841
  merged_at = review_meta.get('merged_at')
842
  closed_at = review_meta.get('closed_at')
843
 
@@ -850,23 +662,15 @@ def get_pr_status_from_metadata(review_meta):
850
 
851
 
852
  def calculate_review_stats_from_metadata(metadata_list):
853
- """
854
- Calculate statistics from a list of review metadata.
855
-
856
- Returns:
857
- Dictionary with review metrics (total_reviews, merged_prs, acceptance_rate, etc.)
858
- """
859
  total_reviews = len(metadata_list)
860
 
861
- # Count merged PRs
862
  merged_prs = sum(1 for review_meta in metadata_list
863
- if get_pr_status_from_metadata(review_meta) == 'merged')
864
 
865
- # Count rejected PRs
866
  rejected_prs = sum(1 for review_meta in metadata_list
867
  if get_pr_status_from_metadata(review_meta) == 'closed')
868
 
869
- # Count pending PRs
870
  pending_prs = sum(1 for review_meta in metadata_list
871
  if get_pr_status_from_metadata(review_meta) == 'open')
872
 
@@ -883,36 +687,14 @@ def calculate_review_stats_from_metadata(metadata_list):
883
 
884
 
885
  def calculate_monthly_metrics_by_agent(all_metadata_dict, agents):
886
- """
887
- Calculate monthly metrics for all agents for visualization.
888
-
889
- Args:
890
- all_metadata_dict: Dictionary mapping agent identifier to list of PR metadata
891
- agents: List of agent dictionaries with metadata
892
-
893
- Returns:
894
- dict: {
895
- 'agents': list of agent names,
896
- 'months': list of month labels (e.g., '2025-01'),
897
- 'data': {
898
- agent_name: {
899
- 'acceptance_rates': list of acceptance rates by month,
900
- 'total_reviews': list of review counts by month,
901
- 'merged_prs': list of merged PR counts by month,
902
- }
903
- }
904
- }
905
- """
906
- # Create mapping from agent_identifier to agent_name
907
  identifier_to_name = {agent.get('github_identifier'): agent.get('name') for agent in agents if agent.get('github_identifier')}
908
 
909
  if not all_metadata_dict:
910
  return {'agents': [], 'months': [], 'data': {}}
911
 
912
- # Group by agent and month
913
  agent_month_data = defaultdict(lambda: defaultdict(list))
914
 
915
- # Flatten the dict of lists into a single list with agent_identifier added
916
  for agent_identifier, metadata_list in all_metadata_dict.items():
917
  for review_meta in metadata_list:
918
  reviewed_at = review_meta.get('reviewed_at')
@@ -920,7 +702,6 @@ def calculate_monthly_metrics_by_agent(all_metadata_dict, agents):
920
  if not reviewed_at:
921
  continue
922
 
923
- # Get agent_name from identifier
924
  agent_name = identifier_to_name.get(agent_identifier, agent_identifier)
925
 
926
  try:
@@ -931,13 +712,11 @@ def calculate_monthly_metrics_by_agent(all_metadata_dict, agents):
931
  print(f"Warning: Could not parse date '{reviewed_at}': {e}")
932
  continue
933
 
934
- # Get all unique months and sort them
935
  all_months = set()
936
  for agent_data in agent_month_data.values():
937
  all_months.update(agent_data.keys())
938
  months = sorted(list(all_months))
939
 
940
- # Calculate metrics for each agent and month
941
  result_data = {}
942
  for agent_name, month_dict in agent_month_data.items():
943
  acceptance_rates = []
@@ -947,18 +726,14 @@ def calculate_monthly_metrics_by_agent(all_metadata_dict, agents):
947
  for month in months:
948
  reviews_in_month = month_dict.get(month, [])
949
 
950
- # Count merged PRs
951
  merged_count = sum(1 for review in reviews_in_month
952
  if get_pr_status_from_metadata(review) == 'merged')
953
 
954
- # Count rejected PRs
955
  rejected_count = sum(1 for review in reviews_in_month
956
  if get_pr_status_from_metadata(review) == 'closed')
957
 
958
- # Total reviews
959
  total_count = len(reviews_in_month)
960
 
961
- # Calculate acceptance rate (exclude pending PRs)
962
  completed_count = merged_count + rejected_count
963
  acceptance_rate = (merged_count / completed_count * 100) if completed_count > 0 else None
964
 
@@ -982,16 +757,7 @@ def calculate_monthly_metrics_by_agent(all_metadata_dict, agents):
982
 
983
 
984
  def construct_leaderboard_from_metadata(all_metadata_dict, agents):
985
- """
986
- Construct leaderboard from in-memory review metadata.
987
-
988
- Args:
989
- all_metadata_dict: Dictionary mapping agent identifier to list of PR metadata
990
- agents: List of agent dictionaries with metadata
991
-
992
- Returns:
993
- Dictionary of agent stats.
994
- """
995
  if not agents:
996
  print("Error: No agents found")
997
  return {}
@@ -1002,10 +768,7 @@ def construct_leaderboard_from_metadata(all_metadata_dict, agents):
1002
  identifier = agent.get('github_identifier')
1003
  agent_name = agent.get('name', 'Unknown')
1004
 
1005
- # Get metadata for this agent from the dictionary
1006
  bot_metadata = all_metadata_dict.get(identifier, [])
1007
-
1008
- # Calculate stats
1009
  stats = calculate_review_stats_from_metadata(bot_metadata)
1010
 
1011
  cache_dict[identifier] = {
@@ -1019,16 +782,7 @@ def construct_leaderboard_from_metadata(all_metadata_dict, agents):
1019
 
1020
 
1021
  def save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics):
1022
- """
1023
- Save leaderboard data and monthly metrics to HuggingFace dataset as swe-review.json.
1024
-
1025
- Args:
1026
- leaderboard_dict: Dictionary of agent stats from construct_leaderboard_from_metadata()
1027
- monthly_metrics: Monthly metrics data from calculate_monthly_metrics_by_agent()
1028
-
1029
- Returns:
1030
- bool: True if successful, False otherwise
1031
- """
1032
  try:
1033
  token = get_hf_token()
1034
  if not token:
@@ -1037,7 +791,6 @@ def save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics):
1037
  api = HfApi(token=token)
1038
  filename = "swe-review.json"
1039
 
1040
- # Combine leaderboard and monthly metrics
1041
  combined_data = {
1042
  'last_updated': datetime.now(timezone.utc).isoformat(),
1043
  'leaderboard': leaderboard_dict,
@@ -1047,12 +800,10 @@ def save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics):
1047
  }
1048
  }
1049
 
1050
- # Save locally first
1051
  with open(filename, 'w') as f:
1052
  json.dump(combined_data, f, indent=2)
1053
 
1054
  try:
1055
- # Upload to HuggingFace with retry logic
1056
  upload_file_with_backoff(
1057
  api=api,
1058
  path_or_fileobj=filename,
@@ -1062,7 +813,6 @@ def save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics):
1062
  )
1063
  return True
1064
  finally:
1065
- # Always clean up local file
1066
  if os.path.exists(filename):
1067
  os.remove(filename)
1068
 
@@ -1074,21 +824,19 @@ def save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics):
1074
 
1075
 
1076
  # =============================================================================
1077
- # MAIN MINING FUNCTION
1078
  # =============================================================================
1079
 
1080
  def mine_all_agents():
1081
  """
1082
- Mine review metadata for all agents within LEADERBOARD_TIME_FRAME_DAYS and save to HuggingFace.
1083
- Downloads GHArchive data first, then uses ONE DuckDB query for ALL agents, then batch uploads with time gaps.
1084
  """
1085
- # Step 1: Download GHArchive data
1086
  print(f"\n[1/5] Downloading GHArchive data...")
1087
 
1088
  if not download_all_gharchive_data():
1089
  print("Warning: Download had errors, continuing with available data...")
1090
 
1091
- # Step 2: Load agent metadata from HuggingFace
1092
  print(f"\n[2/5] Loading agent metadata...")
1093
 
1094
  agents = load_agents_from_hf()
@@ -1096,7 +844,6 @@ def mine_all_agents():
1096
  print("Error: No agents found")
1097
  return
1098
 
1099
- # Extract all identifiers
1100
  identifiers = [agent['github_identifier'] for agent in agents if agent.get('github_identifier')]
1101
  if not identifiers:
1102
  print("Error: No valid agent identifiers found")
@@ -1104,55 +851,42 @@ def mine_all_agents():
1104
 
1105
  print(f"\n[3/5] Mining review metadata ({len(identifiers)} agents, {LEADERBOARD_TIME_FRAME_DAYS} days)...")
1106
 
1107
- # Initialize DuckDB connection
1108
  try:
1109
  conn = get_duckdb_connection()
1110
  except Exception as e:
1111
  print(f"Failed to initialize DuckDB connection: {str(e)}")
1112
  return
1113
 
1114
- # Define time range: past LEADERBOARD_TIME_FRAME_DAYS (excluding today)
1115
  current_time = datetime.now(timezone.utc)
1116
  end_date = current_time.replace(hour=0, minute=0, second=0, microsecond=0)
1117
  start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
1118
 
1119
  try:
1120
- # Use single query for all agents
1121
- all_metadata = fetch_all_pr_metadata_single_query(
1122
  conn, identifiers, start_date, end_date
1123
  )
1124
 
1125
- # Calculate summary statistics
1126
- total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
1127
  agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
1128
 
1129
- print(f"Query complete: {total_prs} PRs found for {agents_with_data}/{len(agents)} agents")
1130
-
1131
  except Exception as e:
1132
  print(f"Error during DuckDB fetch: {str(e)}")
1133
  import traceback
1134
  traceback.print_exc()
1135
  return
1136
  finally:
1137
- # Close DuckDB connection
1138
  conn.close()
1139
 
1140
- # Step 4: Batch upload review metadata with time gaps
1141
  print(f"\n[4/5] Uploading review metadata...")
1142
 
1143
  success_count, error_count = batch_upload_review_metadata(all_metadata)
1144
 
1145
- # Step 5: Construct and save leaderboard data
1146
  print(f"\n[5/5] Saving leaderboard...")
1147
 
1148
  try:
1149
- # Construct leaderboard from in-memory data
1150
  leaderboard_dict = construct_leaderboard_from_metadata(all_metadata, agents)
1151
-
1152
- # Calculate monthly metrics from in-memory data
1153
  monthly_metrics = calculate_monthly_metrics_by_agent(all_metadata, agents)
1154
-
1155
- # Save to HuggingFace
1156
  save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics)
1157
 
1158
  print(f"\nCOMPLETE: {success_count} files uploaded" + (f", {error_count} errors" if error_count > 0 else ""))
@@ -1168,30 +902,16 @@ def mine_all_agents():
1168
  # =============================================================================
1169
 
1170
  def setup_scheduler():
1171
- """
1172
- Set up APScheduler to run mining jobs periodically.
1173
- Schedule is configurable via environment variables.
1174
-
1175
- Environment variables:
1176
- - SCHEDULE_ENABLED: Enable/disable scheduler (default: true)
1177
- - SCHEDULE_DAY_OF_MONTH: Day of month to run (default: 22, fourth week)
1178
- - SCHEDULE_HOUR: Hour to run (0-23, default: 0)
1179
- - SCHEDULE_MINUTE: Minute to run (0-59, default: 0)
1180
- - SCHEDULE_TIMEZONE: Timezone for scheduling (default: UTC)
1181
- """
1182
- # Configure logging for APScheduler
1183
  logging.basicConfig(
1184
  level=logging.INFO,
1185
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
1186
  )
1187
 
1188
- # Disable verbose HTTP request logging from httpx (used by huggingface_hub)
1189
  logging.getLogger('httpx').setLevel(logging.WARNING)
1190
 
1191
- # Create scheduler
1192
  scheduler = BlockingScheduler(timezone=SCHEDULE_TIMEZONE)
1193
 
1194
- # Create cron trigger with configured schedule (monthly on specific day)
1195
  trigger = CronTrigger(
1196
  day=SCHEDULE_DAY_OF_MONTH,
1197
  hour=SCHEDULE_HOUR,
@@ -1199,7 +919,6 @@ def setup_scheduler():
1199
  timezone=SCHEDULE_TIMEZONE
1200
  )
1201
 
1202
- # Add job to scheduler
1203
  scheduler.add_job(
1204
  mine_all_agents,
1205
  trigger=trigger,
@@ -1208,13 +927,11 @@ def setup_scheduler():
1208
  replace_existing=True
1209
  )
1210
 
1211
- # Print schedule information
1212
  from datetime import datetime
1213
  next_run = trigger.get_next_fire_time(None, datetime.now(trigger.timezone))
1214
  print(f"Scheduler: Monthly on day {SCHEDULE_DAY_OF_MONTH} at {SCHEDULE_HOUR:02d}:{SCHEDULE_MINUTE:02d} {SCHEDULE_TIMEZONE}")
1215
  print(f"Next run: {next_run}\n")
1216
 
1217
- # Start scheduler (blocking call)
1218
  print(f"\nScheduler started")
1219
  scheduler.start()
1220
 
@@ -1225,8 +942,6 @@ def setup_scheduler():
1225
 
1226
  if __name__ == "__main__":
1227
  if SCHEDULE_ENABLED:
1228
- # Run with scheduler
1229
  setup_scheduler()
1230
  else:
1231
- # Run without scheduler, just mine once
1232
  mine_all_agents()
 
25
 
26
  AGENTS_REPO = "SWE-Arena/bot_metadata"
27
  REVIEW_METADATA_REPO = "SWE-Arena/review_metadata"
28
+ LEADERBOARD_REPO = "SWE-Arena/leaderboard_metadata"
29
+ LEADERBOARD_TIME_FRAME_DAYS = 180
30
+ GHARCHIVE_DATA_DIR = "../gharchive/data"
31
+ DUCKDB_CACHE_FILE = "cache.duckdb"
32
 
33
+ # OPTIMIZED DUCKDB CONFIGURATION
34
+ DUCKDB_THREADS = 8
35
+ DUCKDB_MEMORY_LIMIT = "64GB"
36
+
37
+ # Streaming batch configuration
38
+ BATCH_SIZE_DAYS = 7 # Process 1 week at a time (~168 hourly files)
39
+ # At this size: ~7 days × 24 files × ~100MB per file = ~16GB uncompressed per batch
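+ # Rough sanity check of the estimate above (a sketch; the ~100MB-per-file
+ # figure is an assumption, not a measured value):
+ #   files_per_batch = BATCH_SIZE_DAYS * 24       # 7 * 24 = 168 hourly files
+ #   approx_gb_per_batch = files_per_batch * 0.1  # 168 * ~0.1 GB ≈ ~16.8 GB uncompressed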
40
 
41
  # Download configuration
42
+ DOWNLOAD_WORKERS = 4
43
+ DOWNLOAD_RETRY_DELAY = 2
44
+ MAX_RETRIES = 5
45
 
46
  # Upload configuration
47
+ UPLOAD_DELAY_SECONDS = 5
48
+ UPLOAD_INITIAL_BACKOFF = 60
49
+ UPLOAD_MAX_BACKOFF = 3600
50
 
51
  # Scheduler configuration
52
+ SCHEDULE_ENABLED = False
53
+ SCHEDULE_DAY_OF_MONTH = 22
54
+ SCHEDULE_HOUR = 0
55
+ SCHEDULE_MINUTE = 0
56
+ SCHEDULE_TIMEZONE = 'UTC'
57
 
58
  # =============================================================================
59
  # UTILITY FUNCTIONS
 
84
 
85
 
86
  def normalize_date_format(date_string):
87
+ """Convert date strings or datetime objects to standardized ISO 8601 format with Z suffix."""
88
  if not date_string or date_string == 'N/A':
89
  return 'N/A'
90
 
91
  try:
92
  import re
 
 
93
 
94
+ if isinstance(date_string, datetime):
95
+ return date_string.strftime('%Y-%m-%dT%H:%M:%SZ')
96
+
97
+ date_string = re.sub(r'\s+', ' ', date_string.strip())
98
  date_string = date_string.replace(' ', 'T')
99
 
 
 
100
  if len(date_string) >= 3:
101
  if date_string[-3:-2] in ('+', '-') and ':' not in date_string[-3:]:
102
  date_string = date_string + ':00'
103
 
 
104
  dt = datetime.fromisoformat(date_string.replace('Z', '+00:00'))
 
 
105
  return dt.strftime('%Y-%m-%dT%H:%M:%SZ')
106
  except Exception as e:
107
  print(f"Warning: Could not parse date '{date_string}': {e}")
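  # Illustrative behavior, carried over from the earlier docstring examples:
  #   normalize_date_format('2025-10-15T23:23:47.983068')  -> '2025-10-15T23:23:47Z'
  #   normalize_date_format('2025-06-17 21:21:07+00')      -> '2025-06-17T21:21:07Z'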
 
121
  # =============================================================================
122
 
123
  def download_file(url):
124
+ """Download a GHArchive file with retry logic."""
125
  filename = url.split("/")[-1]
126
  filepath = os.path.join(GHARCHIVE_DATA_DIR, filename)
127
 
 
128
  if os.path.exists(filepath):
129
  return True
130
 
 
131
  for attempt in range(MAX_RETRIES):
132
  try:
133
  response = requests.get(url, timeout=30)
 
138
 
139
  except requests.exceptions.HTTPError as e:
140
  if e.response.status_code == 404:
 
141
  return False
142
  else:
 
143
  if attempt < MAX_RETRIES - 1:
144
+ wait_time = DOWNLOAD_RETRY_DELAY * (2 ** attempt)
145
  print(f" ⚠ {filename}: HTTP error {e.response.status_code}, retrying in {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES})")
146
  time.sleep(wait_time)
147
  else:
 
150
  except (requests.exceptions.Timeout,
151
  requests.exceptions.ConnectionError,
152
  requests.exceptions.ReadTimeout) as e:
 
153
  if attempt < MAX_RETRIES - 1:
154
+ wait_time = DOWNLOAD_RETRY_DELAY * (2 ** attempt)
155
  print(f" ⚠ {filename}: {type(e).__name__}, retrying in {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES})")
156
  time.sleep(wait_time)
157
  else:
158
  print(f" ✗ {filename}: Failed after {MAX_RETRIES} attempts - {type(e).__name__}")
159
 
160
  except Exception as e:
 
161
  if attempt < MAX_RETRIES - 1:
162
  wait_time = DOWNLOAD_RETRY_DELAY * (2 ** attempt)
163
  print(f" ⚠ {filename}: {e}, retrying in {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES})")
 
169
 
170
 
171
  def download_all_gharchive_data():
172
+ """Download all GHArchive data files for the last LEADERBOARD_TIME_FRAME_DAYS."""
173
  os.makedirs(GHARCHIVE_DATA_DIR, exist_ok=True)
174
 
 
175
  end_date = datetime.now()
176
  start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
177
 
 
179
  current_date = start_date
180
  while current_date <= end_date:
181
  date_str = current_date.strftime("%Y-%m-%d")
 
182
  for hour in range(24):
183
  url = f"https://data.gharchive.org/{date_str}-{hour}.json.gz"
184
  urls.append(url)
 
188
 
189
  try:
190
  with ThreadPoolExecutor(max_workers=DOWNLOAD_WORKERS) as executor:
 
191
  futures = [executor.submit(download_file, url) for url in urls]
 
 
192
  for future in as_completed(futures):
193
  downloads_processed += 1
194
 
 
203
 
204
 
205
  # =============================================================================
206
+ # HUGGINGFACE API WRAPPERS
207
  # =============================================================================
208
 
209
  def is_retryable_error(e):
210
+ """Check if exception is retryable (rate limit or timeout error)."""
211
  if isinstance(e, HfHubHTTPError):
212
  if e.response.status_code == 429:
213
  return True
214
 
 
215
  if isinstance(e, (requests.exceptions.Timeout,
216
  requests.exceptions.ReadTimeout,
217
  requests.exceptions.ConnectTimeout)):
218
  return True
219
 
 
220
  if isinstance(e, Exception):
221
  error_str = str(e).lower()
222
  if 'timeout' in error_str or 'timed out' in error_str:
 
237
  )
238
  )
239
  def list_repo_files_with_backoff(api, **kwargs):
240
+ """Wrapper for api.list_repo_files() with exponential backoff."""
241
  return api.list_repo_files(**kwargs)
242
 
243
 
 
253
  )
254
  )
255
  def hf_hub_download_with_backoff(**kwargs):
256
+ """Wrapper for hf_hub_download() with exponential backoff."""
257
  return hf_hub_download(**kwargs)
258
 
259
 
 
269
  )
270
  )
271
  def upload_file_with_backoff(api, **kwargs):
272
+ """Wrapper for api.upload_file() with exponential backoff."""
273
  return api.upload_file(**kwargs)
274
 
275
 
 
285
  )
286
  )
287
  def upload_folder_with_backoff(api, **kwargs):
288
+ """Wrapper for api.upload_folder() with exponential backoff."""
289
  return api.upload_folder(**kwargs)
290
 
291
 
292
  def get_duckdb_connection():
293
  """
294
+ Initialize DuckDB connection with OPTIMIZED memory settings.
295
+ Uses persistent database and reduced memory footprint.
 
 
296
  """
 
297
  conn = duckdb.connect(DUCKDB_CACHE_FILE)
298
 
299
+ # OPTIMIZED SETTINGS
300
+ conn.execute(f"SET threads TO {DUCKDB_THREADS};")
301
+ conn.execute("SET preserve_insertion_order = false;")
302
+ conn.execute("SET enable_object_cache = true;")
303
+ conn.execute("SET temp_directory = '/tmp/duckdb_temp';")
304
+ conn.execute(f"SET memory_limit = '{DUCKDB_MEMORY_LIMIT}';")
305
+ conn.execute(f"SET max_memory = '{DUCKDB_MEMORY_LIMIT}';")
306
 
307
  return conn
308
 
309
 
310
  def generate_file_path_patterns(start_date, end_date, data_dir=GHARCHIVE_DATA_DIR):
311
+ """Generate file path patterns for GHArchive data in date range (only existing files)."""
312
  file_patterns = []
313
  missing_dates = set()
314
 
 
316
  end_day = end_date.replace(hour=0, minute=0, second=0, microsecond=0)
317
 
318
  while current_date <= end_day:
 
319
  date_has_files = False
320
  for hour in range(24):
321
  pattern = os.path.join(data_dir, f"{current_date.strftime('%Y-%m-%d')}-{hour}.json.gz")
 
322
  if os.path.exists(pattern):
323
  file_patterns.append(pattern)
324
  date_has_files = True
325
 
 
326
  if not date_has_files:
327
  missing_dates.add(current_date.strftime('%Y-%m-%d'))
328
 
 
329
  current_date += timedelta(days=1)
330
 
 
331
  if missing_dates:
332
+ print(f" Skipping {len(missing_dates)} date(s) with no data")
333
 
334
  return file_patterns
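  # Illustrative input, per the hourly naming kept from the earlier version: a day
  # such as 2024-11-15 contributes at most 24 patterns of the form
  # ../gharchive/data/2024-11-15-{0..23}.json.gz (only files present on disk are returned).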
335
 
336
 
337
  # =============================================================================
338
+ # STREAMING BATCH PROCESSING FOR REVIEW METADATA
339
  # =============================================================================
340
 
341
+ def fetch_all_review_metadata_streaming(conn, identifiers, start_date, end_date):
342
  """
343
+ OPTIMIZED: Fetch review metadata using streaming batch processing.
344
 
345
+ Processes GHArchive files in BATCH_SIZE_DAYS chunks to limit memory usage.
346
+ Instead of loading 180 days (4,344 files) at once, processes 7 days at a time.
347
+
348
+ This prevents OOM errors by:
349
+ 1. Only keeping ~168 hourly files in memory per batch (vs 4,344)
350
+ 2. Incrementally building the results dictionary
351
+ 3. Allowing DuckDB to garbage collect after each batch
352
 
353
  Args:
354
  conn: DuckDB connection instance
 
357
  end_date: End datetime (timezone-aware)
358
 
359
  Returns:
360
+ Dictionary mapping agent identifier to list of review metadata
361
  """
362
+ identifier_list = ', '.join([f"'{id}'" for id in identifiers])
363
+ metadata_by_agent = defaultdict(list)
364
 
365
+ # Calculate total batches
366
+ total_days = (end_date - start_date).days
367
+ total_batches = (total_days // BATCH_SIZE_DAYS) + 1
368
 
369
+ # Process in configurable batches
370
+ current_date = start_date
371
+ batch_num = 0
372
+ total_reviews = 0
373
 
374
+ print(f" Streaming {total_batches} batches of {BATCH_SIZE_DAYS}-day intervals...")
375
 
376
+ while current_date <= end_date:
377
+ batch_num += 1
378
+ batch_end = min(current_date + timedelta(days=BATCH_SIZE_DAYS - 1), end_date)
379
 
380
+ # Get file patterns for THIS BATCH ONLY
381
+ file_patterns = generate_file_path_patterns(current_date, batch_end)
382
+
383
+ if not file_patterns:
384
+ print(f" Batch {batch_num}/{total_batches}: {current_date.date()} to {batch_end.date()} - NO DATA")
385
+ current_date = batch_end + timedelta(days=1)
386
+ continue
387
+
388
+ # Progress indicator
389
+ print(f" Batch {batch_num}/{total_batches}: {current_date.date()} to {batch_end.date()} ({len(file_patterns)} files)... ", end="", flush=True)
390
+
391
+ # Build file patterns SQL for THIS BATCH
392
+ file_patterns_sql = '[' + ', '.join([f"'{fp}'" for fp in file_patterns]) + ']'
393
+
394
+ # SIMPLIFIED query for review metadata
395
+ # Focuses on PullRequestReviewEvent and tracks PR status
396
+ query = f"""
397
+ WITH review_events AS (
398
+ SELECT
399
+ payload.pull_request.html_url as pr_url,
400
+ actor.login as reviewer,
401
+ COALESCE(payload.review.submitted_at, created_at) as reviewed_at
402
+ FROM read_json({file_patterns_sql}, union_by_name=true, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true, maximum_object_size=2147483648)
403
+ WHERE
404
+ type = 'PullRequestReviewEvent'
405
+ AND payload.pull_request.html_url IS NOT NULL
406
+ AND actor.login IN ({identifier_list})
407
+ ),
408
+ pr_status AS (
409
+ SELECT
410
+ payload.pull_request.html_url as pr_url,
411
+ payload.pull_request.merged as is_merged,
412
+ payload.pull_request.merged_at as merged_at,
413
+ payload.pull_request.closed_at as closed_at,
414
+ ROW_NUMBER() OVER (PARTITION BY payload.pull_request.html_url ORDER BY created_at DESC) as rn
415
+ FROM read_json({file_patterns_sql}, union_by_name=true, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true, maximum_object_size=2147483648)
416
+ WHERE
417
+ type = 'PullRequestEvent'
418
+ AND payload.action = 'closed'
419
+ AND payload.pull_request.html_url IS NOT NULL
420
+ AND payload.pull_request.html_url IN (SELECT DISTINCT pr_url FROM review_events)
421
+ )
422
+ SELECT
423
+ re.reviewer,
424
+ re.pr_url as url,
425
+ re.reviewed_at,
426
+ ps.merged_at,
427
+ ps.closed_at
428
+ FROM review_events re
429
+ LEFT JOIN (SELECT * FROM pr_status WHERE rn = 1) ps ON re.pr_url = ps.pr_url
430
+ ORDER BY re.reviewer, re.reviewed_at DESC
431
+ """
432
+
433
+ try:
434
  results = conn.execute(query).fetchall()
435
+ batch_reviews = 0
436
+
437
+ # Add results to accumulating dictionary
438
+ for row in results:
439
+ reviewer = row[0]
440
+ url = row[1]
441
+ reviewed_at = normalize_date_format(row[2]) if row[2] else None
442
+ merged_at = normalize_date_format(row[3]) if row[3] else None
443
+ closed_at = normalize_date_format(row[4]) if row[4] else None
444
+
445
+ if not url or not reviewed_at:
446
+ continue
447
+
448
+ review_metadata = {
449
+ 'url': url,
450
+ 'reviewed_at': reviewed_at,
451
+ 'merged_at': merged_at,
452
+ 'closed_at': closed_at,
453
+ }
454
 
455
+ metadata_by_agent[reviewer].append(review_metadata)
456
+ batch_reviews += 1
457
+ total_reviews += 1
 
458
 
459
+ print(f"✓ {batch_reviews} reviews found")
 
460
 
461
+ except Exception as e:
462
+ print(f"\n ✗ Batch {batch_num} error: {str(e)}")
463
+ import traceback
464
+ traceback.print_exc()
465
+
466
+ # Move to next batch
467
+ current_date = batch_end + timedelta(days=1)
468
+
469
+ # Final summary
470
+ agents_with_data = sum(1 for reviews in metadata_by_agent.values() if reviews)
471
+ print(f"\n ✓ Complete: {total_reviews} reviews found for {agents_with_data}/{len(identifiers)} agents")
472
+
473
+ return dict(metadata_by_agent)
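+ # Minimal usage sketch (mirrors how mine_all_agents() calls this function below):
+ #   conn = get_duckdb_connection()
+ #   end_date = datetime.now(timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0)
+ #   start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
+ #   all_metadata = fetch_all_review_metadata_streaming(conn, identifiers, start_date, end_date)
+ #   conn.close()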
474
 
475
 
476
  # =============================================================================
477
+ # HUGGINGFACE STORAGE FUNCTIONS
478
  # =============================================================================
479
 
480
  def group_metadata_by_date(metadata_list):
481
+ """Group review metadata by date for daily storage."""
482
  grouped = defaultdict(list)
483
 
484
  for review_meta in metadata_list:
 
497
 
498
 
499
  def upload_single_file_with_retry(api, local_path, repo_path, repo_id, repo_type, commit_message, max_retries=MAX_RETRIES):
500
+ """Upload a single file with exponential backoff retry logic."""
501
  for attempt in range(max_retries):
502
  try:
503
  upload_file_with_backoff(
 
511
  return True
512
  except Exception as e:
513
  if attempt < max_retries - 1:
 
514
  wait_time = min(UPLOAD_INITIAL_BACKOFF * (2 ** attempt), UPLOAD_MAX_BACKOFF)
515
  print(f" {e} error on attempt {attempt + 1}/{max_retries}. Retrying in {wait_time}s...")
516
  time.sleep(wait_time)
 
521
 
522
 
523
  def batch_upload_review_metadata(all_metadata):
524
+ """Upload review metadata for all agents with time gaps between uploads."""
525
  try:
526
  token = get_hf_token()
527
  if not token:
 
533
  error_count = 0
534
  total_files = 0
535
 
 
536
  for agent_identifier, metadata_list in all_metadata.items():
537
  if metadata_list:
538
  grouped = group_metadata_by_date(metadata_list)
 
546
  if not metadata_list:
547
  continue
548
 
 
549
  grouped = group_metadata_by_date(metadata_list)
550
 
 
551
  agent_temp_dir = tempfile.mkdtemp()
552
 
553
  try:
 
554
  local_files = []
555
  for (review_year, month, day), day_metadata in grouped.items():
556
  filename = f"{review_year}.{month:02d}.{day:02d}.jsonl"
557
  local_path = os.path.join(agent_temp_dir, filename)
558
  repo_path = f"{agent_identifier}/{filename}"
559
 
 
560
  day_metadata.sort(key=lambda x: x.get('reviewed_at', ''), reverse=True)
 
 
561
  save_jsonl(local_path, day_metadata)
562
  local_files.append((local_path, repo_path, len(day_metadata)))
563
 
 
564
  agent_success = 0
565
  agent_error = 0
566
 
 
582
  agent_error += 1
583
  error_count += 1
584
 
 
585
  if file_idx < len(local_files):
586
  time.sleep(UPLOAD_DELAY_SECONDS)
587
 
588
  finally:
 
589
  if os.path.exists(agent_temp_dir):
590
  import shutil
591
  shutil.rmtree(agent_temp_dir)
 
605
 
606
 
607
  def load_agents_from_hf():
608
+ """Load all agent metadata JSON files from HuggingFace dataset."""
609
  try:
610
  api = HfApi()
611
  agents = []
612
 
 
613
  files = list_repo_files_with_backoff(api=api, repo_id=AGENTS_REPO, repo_type="dataset")
 
 
614
  json_files = [f for f in files if f.endswith('.json')]
615
 
 
616
  for json_file in json_files:
617
  try:
618
  file_path = hf_hub_download_with_backoff(
 
624
  with open(file_path, 'r') as f:
625
  agent_data = json.load(f)
626
 
 
627
  if agent_data.get('status') != 'public':
628
  continue
629
 
 
630
  github_identifier = json_file.replace('.json', '')
631
  agent_data['github_identifier'] = github_identifier
632
 
 
637
  continue
638
 
639
  print(f"Download complete: {len(agents)} agents")
 
640
  return agents
641
 
642
  except Exception as e:
 
644
  return []
645
 
646
 
647
+ # =============================================================================
648
+ # STATISTICS CALCULATION
649
+ # =============================================================================
650
 
651
+ def get_pr_status_from_metadata(review_meta):
652
+ """Derive PR status from merged_at and closed_at fields."""
 
653
  merged_at = review_meta.get('merged_at')
654
  closed_at = review_meta.get('closed_at')
655
 
 
662
 
663
 
664
  def calculate_review_stats_from_metadata(metadata_list):
665
+ """Calculate statistics from a list of review metadata."""
666
  total_reviews = len(metadata_list)
667
 
 
668
  merged_prs = sum(1 for review_meta in metadata_list
669
+ if get_pr_status_from_metadata(review_meta) == 'merged')
670
 
 
671
  rejected_prs = sum(1 for review_meta in metadata_list
672
  if get_pr_status_from_metadata(review_meta) == 'closed')
673
 
 
674
  pending_prs = sum(1 for review_meta in metadata_list
675
  if get_pr_status_from_metadata(review_meta) == 'open')
676
 
 
687
 
688
 
689
  def calculate_monthly_metrics_by_agent(all_metadata_dict, agents):
690
+ """Calculate monthly metrics for all agents for visualization."""
691
  identifier_to_name = {agent.get('github_identifier'): agent.get('name') for agent in agents if agent.get('github_identifier')}
692
 
693
  if not all_metadata_dict:
694
  return {'agents': [], 'months': [], 'data': {}}
695
 
 
696
  agent_month_data = defaultdict(lambda: defaultdict(list))
697
 
 
698
  for agent_identifier, metadata_list in all_metadata_dict.items():
699
  for review_meta in metadata_list:
700
  reviewed_at = review_meta.get('reviewed_at')
 
702
  if not reviewed_at:
703
  continue
704
 
 
705
  agent_name = identifier_to_name.get(agent_identifier, agent_identifier)
706
 
707
  try:
 
712
  print(f"Warning: Could not parse date '{reviewed_at}': {e}")
713
  continue
714
 
 
715
  all_months = set()
716
  for agent_data in agent_month_data.values():
717
  all_months.update(agent_data.keys())
718
  months = sorted(list(all_months))
719
 
 
720
  result_data = {}
721
  for agent_name, month_dict in agent_month_data.items():
722
  acceptance_rates = []
 
726
  for month in months:
727
  reviews_in_month = month_dict.get(month, [])
728
 
 
729
  merged_count = sum(1 for review in reviews_in_month
730
  if get_pr_status_from_metadata(review) == 'merged')
731
 
 
732
  rejected_count = sum(1 for review in reviews_in_month
733
  if get_pr_status_from_metadata(review) == 'closed')
734
 
 
735
  total_count = len(reviews_in_month)
736
 
 
737
  completed_count = merged_count + rejected_count
738
  acceptance_rate = (merged_count / completed_count * 100) if completed_count > 0 else None
739
 
 
757
 
758
 
759
  def construct_leaderboard_from_metadata(all_metadata_dict, agents):
760
+ """Construct leaderboard from in-memory review metadata."""
761
  if not agents:
762
  print("Error: No agents found")
763
  return {}
 
768
  identifier = agent.get('github_identifier')
769
  agent_name = agent.get('name', 'Unknown')
770
 
 
771
  bot_metadata = all_metadata_dict.get(identifier, [])
 
 
772
  stats = calculate_review_stats_from_metadata(bot_metadata)
773
 
774
  cache_dict[identifier] = {
 
782
 
783
 
784
  def save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics):
785
+ """Save leaderboard data and monthly metrics to HuggingFace dataset."""
786
  try:
787
  token = get_hf_token()
788
  if not token:
 
791
  api = HfApi(token=token)
792
  filename = "swe-review.json"
793
 
 
794
  combined_data = {
795
  'last_updated': datetime.now(timezone.utc).isoformat(),
796
  'leaderboard': leaderboard_dict,
 
800
  }
801
  }
802
 
 
803
  with open(filename, 'w') as f:
804
  json.dump(combined_data, f, indent=2)
805
 
806
  try:
 
807
  upload_file_with_backoff(
808
  api=api,
809
  path_or_fileobj=filename,
 
813
  )
814
  return True
815
  finally:
 
816
  if os.path.exists(filename):
817
  os.remove(filename)
818
 
 
824
 
825
 
826
  # =============================================================================
827
+ # MINING FUNCTION
828
  # =============================================================================
829
 
830
  def mine_all_agents():
831
  """
832
+ Mine review metadata for all agents using STREAMING batch processing.
833
+ Downloads GHArchive data, then uses BATCH-based DuckDB queries.
834
  """
 
835
  print(f"\n[1/5] Downloading GHArchive data...")
836
 
837
  if not download_all_gharchive_data():
838
  print("Warning: Download had errors, continuing with available data...")
839
 
 
840
  print(f"\n[2/5] Loading agent metadata...")
841
 
842
  agents = load_agents_from_hf()
 
844
  print("Error: No agents found")
845
  return
846
 
 
847
  identifiers = [agent['github_identifier'] for agent in agents if agent.get('github_identifier')]
848
  if not identifiers:
849
  print("Error: No valid agent identifiers found")
 
851
 
852
  print(f"\n[3/5] Mining review metadata ({len(identifiers)} agents, {LEADERBOARD_TIME_FRAME_DAYS} days)...")
853
 
 
854
  try:
855
  conn = get_duckdb_connection()
856
  except Exception as e:
857
  print(f"Failed to initialize DuckDB connection: {str(e)}")
858
  return
859
 
 
860
  current_time = datetime.now(timezone.utc)
861
  end_date = current_time.replace(hour=0, minute=0, second=0, microsecond=0)
862
  start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
863
 
864
  try:
865
+ # USE STREAMING FUNCTION
866
+ all_metadata = fetch_all_review_metadata_streaming(
867
  conn, identifiers, start_date, end_date
868
  )
869
 
870
+ total_reviews = sum(len(metadata_list) for metadata_list in all_metadata.values())
 
871
  agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
872
 
 
 
873
  except Exception as e:
874
  print(f"Error during DuckDB fetch: {str(e)}")
875
  import traceback
876
  traceback.print_exc()
877
  return
878
  finally:
 
879
  conn.close()
880
 
 
881
  print(f"\n[4/5] Uploading review metadata...")
882
 
883
  success_count, error_count = batch_upload_review_metadata(all_metadata)
884
 
 
885
  print(f"\n[5/5] Saving leaderboard...")
886
 
887
  try:
 
888
  leaderboard_dict = construct_leaderboard_from_metadata(all_metadata, agents)
 
 
889
  monthly_metrics = calculate_monthly_metrics_by_agent(all_metadata, agents)
 
 
890
  save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics)
891
 
892
  print(f"\nCOMPLETE: {success_count} files uploaded" + (f", {error_count} errors" if error_count > 0 else ""))
 
902
  # =============================================================================
903
 
904
  def setup_scheduler():
905
+ """Set up APScheduler to run mining jobs periodically."""
906
  logging.basicConfig(
907
  level=logging.INFO,
908
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
909
  )
910
 
 
911
  logging.getLogger('httpx').setLevel(logging.WARNING)
912
 
 
913
  scheduler = BlockingScheduler(timezone=SCHEDULE_TIMEZONE)
914
 
 
915
  trigger = CronTrigger(
916
  day=SCHEDULE_DAY_OF_MONTH,
917
  hour=SCHEDULE_HOUR,
 
919
  timezone=SCHEDULE_TIMEZONE
920
  )
921
 
 
922
  scheduler.add_job(
923
  mine_all_agents,
924
  trigger=trigger,
 
927
  replace_existing=True
928
  )
929
 
 
930
  from datetime import datetime
931
  next_run = trigger.get_next_fire_time(None, datetime.now(trigger.timezone))
932
  print(f"Scheduler: Monthly on day {SCHEDULE_DAY_OF_MONTH} at {SCHEDULE_HOUR:02d}:{SCHEDULE_MINUTE:02d} {SCHEDULE_TIMEZONE}")
933
  print(f"Next run: {next_run}\n")
934
 
 
935
  print(f"\nScheduler started")
936
  scheduler.start()
937
 
 
942
 
943
  if __name__ == "__main__":
944
  if SCHEDULE_ENABLED:
 
945
  setup_scheduler()
946
  else:
 
947
  mine_all_agents()