zhimin-z committed
Commit a15931a · 1 Parent(s): e7d88ff
Files changed (7)
  1. .gitignore +1 -1
  2. Dockerfile +6 -18
  3. README.md +1 -1
  4. app.py +292 -1141
  5. docker-compose.yml +21 -0
  6. msr.py +635 -475
  7. requirements.txt +3 -5
.gitignore CHANGED
@@ -2,4 +2,4 @@
2
  *.env
3
  *.venv
4
  *.ipynb
5
- *.pyc
 
2
  *.env
3
  *.venv
4
  *.ipynb
5
+ *.pyc
Dockerfile CHANGED
@@ -1,34 +1,22 @@
1
- # Use official Python runtime as base image
2
  FROM python:3.12-slim
3
 
4
  # Set working directory
5
  WORKDIR /app
6
 
7
- # Install system dependencies (if needed)
8
  RUN apt-get update && apt-get install -y \
9
- git \
 
10
  && rm -rf /var/lib/apt/lists/*
11
 
12
- # Copy requirements.txt
13
  COPY requirements.txt .
14
 
15
  # Install Python dependencies
16
  RUN pip install --no-cache-dir -r requirements.txt
17
 
18
- # Copy application files
19
- COPY .env .
20
- COPY msr.py .
21
-
22
- # Create a non-root user for security (optional but recommended)
23
- RUN useradd -m -u 1000 appuser && chown -R appuser:appuser /app
24
- USER appuser
25
-
26
- # Expose port for Gradio web interface (default is 7860)
27
- EXPOSE 7860
28
-
29
  # Set environment variables
30
- ENV GRADIO_SERVER_NAME=0.0.0.0
31
- ENV GRADIO_SERVER_PORT=7860
32
 
33
- # Run the Gradio app
34
  CMD ["python", "msr.py"]
 
 
1
  FROM python:3.12-slim
2
 
3
  # Set working directory
4
  WORKDIR /app
5
 
6
+ # Install system dependencies
7
  RUN apt-get update && apt-get install -y \
8
+ gcc \
9
+ g++ \
10
  && rm -rf /var/lib/apt/lists/*
11
 
12
+ # Copy requirements file
13
  COPY requirements.txt .
14
 
15
  # Install Python dependencies
16
  RUN pip install --no-cache-dir -r requirements.txt
17
 
18
  # Set environment variables
19
+ ENV PYTHONUNBUFFERED=1
 
20
 
21
+ # Run the mining script with scheduler
22
  CMD ["python", "msr.py"]
README.md CHANGED
@@ -57,7 +57,7 @@ We search GitHub using multiple query patterns to catch all PRs associated with
57
  - Co-authored commits (`co-authored-by:`)
58
 
59
  **Regular Updates**
60
- The leaderboard refreshes automatically every day at 12:00 AM UTC.
61
 
62
  **Community Submissions**
63
  Anyone can submit a coding agent to track via the leaderboard. We store agent metadata in Hugging Face datasets (`SWE-Arena/bot_metadata`) and issue metadata in `SWE-Arena/issue_metadata`. The leaderboard is dynamically constructed from the issue metadata. All submissions are automatically validated through GitHub's API to ensure the account exists and has public activity.
 
57
  - Co-authored commits (`co-authored-by:`)
58
 
59
  **Regular Updates**
60
+ The leaderboard refreshes automatically on the 8th of each month at 12:00 AM UTC.
61
 
62
  **Community Submissions**
63
  Anyone can submit a coding agent to track via the leaderboard. We store agent metadata in Hugging Face datasets (`SWE-Arena/bot_metadata`) and issue metadata in `SWE-Arena/issue_metadata`. The leaderboard is dynamically constructed from the issue metadata. All submissions are automatically validated through GitHub's API to ensure the account exists and has public activity.
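
The monthly refresh described above maps onto the APScheduler setup already imported in app.py (`BackgroundScheduler` + `CronTrigger`). Below is a minimal sketch of such a job; the function name `refresh_leaderboard` and the exact day-of-month value are illustrative placeholders, not taken from this commit.

```python
# Minimal sketch: monthly refresh at 12:00 AM UTC with APScheduler.
# `refresh_leaderboard` and the day value are placeholders, not from this commit.
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger


def refresh_leaderboard():
    # Placeholder for the real mining/refresh logic (see msr.py).
    print("Refreshing leaderboard data...")


scheduler = BackgroundScheduler(timezone="UTC")
scheduler.add_job(
    refresh_leaderboard,
    trigger=CronTrigger(day=8, hour=0, minute=0),  # 8th of each month, 12:00 AM UTC
    id="monthly_leaderboard_refresh",
    replace_existing=True,
)
scheduler.start()
```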
app.py CHANGED
@@ -3,21 +3,17 @@ from gradio_leaderboard import Leaderboard, ColumnFilter
3
  import json
4
  import os
5
  import time
6
- import tempfile
7
  import requests
8
- from datetime import datetime, timezone, timedelta
9
- from collections import defaultdict
10
  from huggingface_hub import HfApi, hf_hub_download
11
  from huggingface_hub.errors import HfHubHTTPError
 
12
  from dotenv import load_dotenv
13
  import pandas as pd
14
- import backoff
15
  import random
16
  import plotly.graph_objects as go
17
  from plotly.subplots import make_subplots
18
  from apscheduler.schedulers.background import BackgroundScheduler
19
  from apscheduler.triggers.cron import CronTrigger
20
- from google.cloud import bigquery
21
 
22
  # Load environment variables
23
  load_dotenv()
@@ -27,10 +23,7 @@ load_dotenv()
27
  # =============================================================================
28
 
29
  AGENTS_REPO = "SWE-Arena/bot_metadata" # HuggingFace dataset for agent metadata
30
- PR_METADATA_REPO = "SWE-Arena/pr_metadata" # HuggingFace dataset for PR metadata
31
- LEADERBOARD_REPO = "SWE-Arena/leaderboard_metadata" # For storing computed leaderboard data
32
- LEADERBOARD_TIME_FRAME_DAYS = 180 # Time frame for constructing leaderboard
33
- UPDATE_TIME_FRAME_DAYS = 30 # Time frame for mining new PRs
34
 
35
  LEADERBOARD_COLUMNS = [
36
  ("Agent Name", "string"),
@@ -40,71 +33,8 @@ LEADERBOARD_COLUMNS = [
40
  ("Acceptance Rate (%)", "number"),
41
  ]
42
 
43
- # Global cache for leaderboard data (loaded once at startup)
44
- _LEADERBOARD_CACHE = None
45
-
46
  # =============================================================================
47
- # JSONL FILE OPERATIONS
48
- # =============================================================================
49
-
50
- def load_jsonl(filename):
51
- """Load JSONL file and return list of dictionaries."""
52
- if not os.path.exists(filename):
53
- return []
54
-
55
- data = []
56
- with open(filename, 'r', encoding='utf-8') as f:
57
- for line in f:
58
- line = line.strip()
59
- if line:
60
- try:
61
- entry = json.loads(line)
62
- data.append(entry)
63
- except json.JSONDecodeError as e:
64
- print(f"Warning: Skipping invalid JSON line: {e}")
65
- return data
66
-
67
-
68
- def save_jsonl(filename, data):
69
- """Save list of dictionaries to JSONL file."""
70
- with open(filename, 'w', encoding='utf-8') as f:
71
- for item in data:
72
- f.write(json.dumps(item) + '\n')
73
-
74
-
75
- def parse_date_string(date_string):
76
- """
77
- Parse date string to datetime object, handling various formats.
78
-
79
- Handles:
80
- - ISO format with 'T' or space between date and time
81
- - Timezone with 'Z' or incomplete offset (+00, -00)
82
- - Complete timezone offset (+00:00, -00:00)
83
-
84
- Args:
85
- date_string: Date string in various formats
86
-
87
- Returns:
88
- datetime object or raises exception
89
- """
90
- if not date_string:
91
- raise ValueError("Empty date string")
92
-
93
- # Replace space with 'T' for ISO format compatibility
94
- date_string = date_string.replace(' ', 'T')
95
-
96
- # Fix incomplete timezone offset (+00 or -00 -> +00:00 or -00:00)
97
- if date_string[-3:-2] in ('+', '-') and ':' not in date_string[-3:]:
98
- date_string = date_string + ':00'
99
-
100
- # Parse the date string (handles both with and without microseconds)
101
- dt = datetime.fromisoformat(date_string.replace('Z', '+00:00'))
102
-
103
- return dt
104
-
105
-
106
- # =============================================================================
107
- # HUGGINGFACE API RETRY WRAPPERS
108
  # =============================================================================
109
 
110
  def is_rate_limit_error(e):
@@ -114,368 +44,123 @@ def is_rate_limit_error(e):
114
  return False
115
 
116
 
117
- def backoff_handler(details):
118
- """Handler to print retry attempt information."""
119
- wait_time = details['wait']
120
- tries = details['tries']
121
- wait_minutes = wait_time / 60
122
- print(f" ⏳ Rate limited. Retrying in {wait_minutes:.1f} minutes ({wait_time:.0f}s) - attempt {tries}/8...")
123
-
124
-
125
  @backoff.on_exception(
126
  backoff.expo,
127
  HfHubHTTPError,
128
- giveup=lambda e: not is_rate_limit_error(e),
129
  max_tries=8,
130
- base=300, # Start at 5 minutes (300 seconds)
131
- max_value=3600, # Cap at 60 minutes (3600 seconds)
132
- jitter=backoff.full_jitter,
133
- on_backoff=backoff_handler
 
 
134
  )
135
  def list_repo_files_with_backoff(api, **kwargs):
136
- """Wrapper for HfApi.list_repo_files with exponential backoff on rate limits."""
137
  return api.list_repo_files(**kwargs)
138
 
139
 
140
  @backoff.on_exception(
141
  backoff.expo,
142
  HfHubHTTPError,
143
- giveup=lambda e: not is_rate_limit_error(e),
144
  max_tries=8,
145
- base=300, # Start at 5 minutes (300 seconds)
146
- max_value=3600, # Cap at 60 minutes (3600 seconds)
147
- jitter=backoff.full_jitter,
148
- on_backoff=backoff_handler
 
 
149
  )
150
  def hf_hub_download_with_backoff(**kwargs):
151
- """Wrapper for hf_hub_download with exponential backoff on rate limits."""
152
  return hf_hub_download(**kwargs)
153
 
154
 
155
- @backoff.on_exception(
156
- backoff.expo,
157
- HfHubHTTPError,
158
- giveup=lambda e: not is_rate_limit_error(e),
159
- max_tries=8,
160
- base=300, # Start at 5 minutes (300 seconds)
161
- max_value=3600, # Cap at 60 minutes (3600 seconds)
162
- jitter=backoff.full_jitter,
163
- on_backoff=backoff_handler
164
- )
165
- def upload_folder_with_backoff(api, **kwargs):
166
- """Wrapper for HfApi.upload_folder with exponential backoff on rate limits."""
167
- return api.upload_folder(**kwargs)
168
-
169
-
170
- @backoff.on_exception(
171
- backoff.expo,
172
- HfHubHTTPError,
173
- giveup=lambda e: not is_rate_limit_error(e),
174
- max_tries=8,
175
- base=300, # Start at 5 minutes (300 seconds)
176
- max_value=3600, # Cap at 60 minutes (3600 seconds)
177
- jitter=backoff.full_jitter,
178
- on_backoff=backoff_handler
179
- )
180
- def upload_file_with_backoff(api, **kwargs):
181
- """Wrapper for HfApi.upload_file with exponential backoff on rate limits."""
182
- return api.upload_file(**kwargs)
183
-
184
-
185
  # =============================================================================
186
- # BIGQUERY FUNCTIONS
187
  # =============================================================================
188
 
189
- def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=100, upload_immediately=True):
190
  """
191
- Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
192
- Splits agents into smaller batches to avoid performance issues with large numbers of agents.
193
 
194
- Args:
195
- client: BigQuery client instance
196
- identifiers: List of GitHub usernames/bot identifiers
197
- start_date: Start datetime (timezone-aware)
198
- end_date: End datetime (timezone-aware)
199
- batch_size: Number of agents to process per batch (default: 100)
200
- upload_immediately: If True, upload each batch's results to HuggingFace immediately (default: True)
201
-
202
- Returns:
203
- Dictionary mapping agent identifier to list of issue metadata
204
  """
205
- # Split identifiers into batches
206
- batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
207
- total_batches = len(batches)
208
-
209
- print(f"\n🔍 Using BATCHED approach for {len(identifiers)} agents")
210
- print(f" Total batches: {total_batches} (batch size: {batch_size})")
211
- print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
212
- if upload_immediately:
213
- print(f" Upload mode: Immediate (after each batch)")
214
- else:
215
- print(f" Upload mode: Deferred (all at once)")
216
-
217
- # Collect results from all batches
218
- all_metadata = {}
219
-
220
- for batch_num, batch_identifiers in enumerate(batches, 1):
221
- print(f"\n📦 Processing batch {batch_num}/{total_batches} ({len(batch_identifiers)} agents)...")
222
-
223
  try:
224
- # Query each batch
225
- batch_results = fetch_all_pr_metadata_single_query(
226
227
  )
228
 
229
- # Merge results
230
- for identifier, metadata_list in batch_results.items():
231
- if identifier in all_metadata:
232
- all_metadata[identifier].extend(metadata_list)
233
- else:
234
- all_metadata[identifier] = metadata_list
235
-
236
- print(f" ✓ Batch {batch_num}/{total_batches} complete")
237
-
238
- # Upload immediately after this batch if enabled
239
- if upload_immediately and batch_results:
240
- print(f"\n 📤 Uploading batch {batch_num}/{total_batches} results to HuggingFace...")
241
- upload_success = 0
242
- upload_errors = 0
243
-
244
- for identifier, metadata_list in batch_results.items():
245
- if metadata_list:
246
- if save_pr_metadata_to_hf(metadata_list, identifier):
247
- upload_success += 1
248
- else:
249
- upload_errors += 1
250
-
251
- print(f" ✓ Batch {batch_num}/{total_batches} upload complete ({upload_success} agents uploaded, {upload_errors} errors)")
252
-
253
- except Exception as e:
254
- print(f" ✗ Batch {batch_num}/{total_batches} failed: {str(e)}")
255
- print(f" Continuing with remaining batches...")
256
- continue
257
-
258
- total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
259
- print(f"\n✓ All batches complete! Found {total_prs} total PRs across {len(all_metadata)} agents")
260
-
261
- return all_metadata
262
-
263
-
264
- def get_bigquery_client():
265
- """
266
- Initialize BigQuery client using credentials from environment variable.
267
-
268
- Expects GOOGLE_APPLICATION_CREDENTIALS_JSON environment variable containing
269
- the service account JSON credentials as a string.
270
- """
271
- # Get the JSON content from environment variable
272
- creds_json = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS_JSON')
273
-
274
- if creds_json:
275
- # Create a temporary file to store credentials
276
- with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as temp_file:
277
- temp_file.write(creds_json)
278
- temp_path = temp_file.name
279
-
280
- # Set environment variable to point to temp file
281
- os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = temp_path
282
-
283
- # Initialize BigQuery client
284
- client = bigquery.Client()
285
 
286
- # Clean up temp file
287
- os.unlink(temp_path)
 
288
 
289
- return client
290
- else:
291
- raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON not found in environment")
292
-
293
-
294
- def generate_table_union_statements(start_date, end_date):
295
- """
296
- Generate UNION ALL statements for githubarchive.month tables in date range.
297
-
298
- Args:
299
- start_date: Start datetime
300
- end_date: End datetime
301
-
302
- Returns:
303
- String with UNION ALL SELECT statements for all tables in range
304
- """
305
- table_names = []
306
-
307
- # Start from the beginning of start_date's month
308
- current_date = start_date.replace(day=1)
309
- end_month = end_date.replace(day=1)
310
-
311
- while current_date <= end_month:
312
- table_name = f"`githubarchive.month.{current_date.strftime('%Y%m')}`"
313
- table_names.append(table_name)
314
-
315
- # Move to next month
316
- if current_date.month == 12:
317
- current_date = current_date.replace(year=current_date.year + 1, month=1)
318
- else:
319
- current_date = current_date.replace(month=current_date.month + 1)
320
-
321
- # Create UNION ALL chain
322
- union_parts = [f"SELECT * FROM {table}" for table in table_names]
323
- return " UNION ALL ".join(union_parts)
324
-
325
-
326
- def fetch_all_pr_metadata_single_query(client, identifiers, start_date, end_date):
327
- """
328
- Fetch PR metadata for a BATCH of agents using ONE comprehensive BigQuery query.
329
-
330
- NOTE: This function is designed for smaller batches (~100 agents).
331
- For large numbers of agents, use fetch_issue_metadata_batched() instead.
332
-
333
- This query fetches PRs authored by agents (user.login matches identifier).
334
-
335
- Args:
336
- client: BigQuery client instance
337
- identifiers: List of GitHub usernames/bot identifiers
338
- start_date: Start datetime (timezone-aware)
339
- end_date: End datetime (timezone-aware)
340
-
341
- Returns:
342
- Dictionary mapping agent identifier to list of PR metadata
343
- """
344
- print(f" Querying BigQuery for {len(identifiers)} agents in this batch...")
345
- print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
346
-
347
- # Generate table UNION statements for the time range
348
- table_union = generate_table_union_statements(start_date, end_date)
349
-
350
- # Build identifier list for SQL IN clause (author matching only)
351
- author_list = ', '.join([f"'{id}'" for id in identifiers])
352
-
353
- # Build comprehensive query with CTE
354
- query = f"""
355
- WITH pr_events AS (
356
- -- Get all PR events (opened, closed) for all agents
357
- SELECT
358
- JSON_EXTRACT_SCALAR(payload, '$.pull_request.html_url') as html_url,
359
- JSON_EXTRACT_SCALAR(payload, '$.pull_request.user.login') as pr_author,
360
- JSON_EXTRACT_SCALAR(payload, '$.pull_request.created_at') as created_at,
361
- CAST(JSON_EXTRACT_SCALAR(payload, '$.pull_request.merged') AS BOOL) as is_merged,
362
- JSON_EXTRACT_SCALAR(payload, '$.pull_request.merged_at') as merged_at,
363
- JSON_EXTRACT_SCALAR(payload, '$.pull_request.closed_at') as closed_at,
364
- JSON_EXTRACT_SCALAR(payload, '$.action') as action,
365
- created_at as event_time
366
- FROM (
367
- {table_union}
368
- ) t
369
- WHERE
370
- type = 'PullRequestEvent'
371
- AND JSON_EXTRACT_SCALAR(payload, '$.pull_request.html_url') IS NOT NULL
372
- AND JSON_EXTRACT_SCALAR(payload, '$.pull_request.user.login') IN ({author_list})
373
- ),
374
-
375
- pr_latest_state AS (
376
- -- Get the latest state for each PR (most recent event)
377
- SELECT
378
- html_url,
379
- pr_author,
380
- created_at,
381
- merged_at,
382
- closed_at,
383
- ROW_NUMBER() OVER (PARTITION BY html_url ORDER BY event_time DESC) as row_num
384
- FROM pr_events
385
- )
386
 
387
- -- Return deduplicated PR metadata
388
- SELECT DISTINCT
389
- html_url,
390
- pr_author,
391
- created_at,
392
- merged_at,
393
- closed_at
394
- FROM pr_latest_state
395
- WHERE row_num = 1
396
- ORDER BY created_at DESC
397
- """
398
 
399
- print(f" Scanning {(end_date - start_date).days} days of GitHub Archive data...")
400
- print(f" Batch agents: {', '.join(identifiers[:5])}{'...' if len(identifiers) > 5 else ''}")
401
 
402
- try:
403
- query_job = client.query(query)
404
- results = list(query_job.result())
405
-
406
- print(f" ✓ Found {len(results)} PRs in this batch")
407
-
408
- # Group results by agent
409
- metadata_by_agent = defaultdict(list)
410
-
411
- for row in results:
412
- # Convert datetime objects to ISO strings
413
- created_at = row.created_at
414
- if hasattr(created_at, 'isoformat'):
415
- created_at = created_at.isoformat()
416
-
417
- merged_at = row.merged_at
418
- if hasattr(merged_at, 'isoformat'):
419
- merged_at = merged_at.isoformat()
420
-
421
- closed_at = row.closed_at
422
- if hasattr(closed_at, 'isoformat'):
423
- closed_at = closed_at.isoformat()
424
-
425
- pr_data = {
426
- 'html_url': row.html_url,
427
- 'created_at': created_at,
428
- 'merged_at': merged_at,
429
- 'closed_at': closed_at,
430
- }
431
-
432
- # Assign to agent based on author
433
- pr_author = row.pr_author
434
- if pr_author and pr_author in identifiers:
435
- metadata_by_agent[pr_author].append(pr_data)
436
-
437
- # Print breakdown by agent (only show agents with PRs)
438
- print(f" 📊 Batch breakdown:")
439
- for identifier in identifiers:
440
- count = len(metadata_by_agent.get(identifier, []))
441
- if count > 0:
442
- metadata = metadata_by_agent[identifier]
443
- merged_count = sum(1 for m in metadata if m['merged_at'] is not None)
444
- closed_count = sum(1 for m in metadata if m['closed_at'] is not None and m['merged_at'] is None)
445
- open_count = count - merged_count - closed_count
446
- print(f" {identifier}: {count} PRs ({merged_count} merged, {closed_count} closed, {open_count} open)")
447
-
448
- # Convert defaultdict to regular dict
449
- return dict(metadata_by_agent)
450
 
451
- except Exception as e:
452
- print(f" ✗ BigQuery error: {str(e)}")
453
- import traceback
454
- traceback.print_exc()
455
- return {}
 
456
 
 
 
457
 
458
- # =============================================================================
459
- # GITHUB API OPERATIONS (Minimal - Only for Validation)
460
- # =============================================================================
 
 
 
 
461
 
462
- def get_github_token():
463
- """Get first GitHub token from environment variables."""
464
- token = os.getenv('GITHUB_TOKEN')
465
- if not token:
466
- print("Warning: GITHUB_TOKEN not found. Validation will be limited.")
467
- return token
468
 
469
 
470
  def validate_github_username(identifier):
471
- """Verify that a GitHub identifier exists (simple validation)."""
472
  try:
473
- token = get_github_token()
474
- headers = {'Authorization': f'token {token}'} if token else {}
475
  url = f'https://api.github.com/users/{identifier}'
476
-
477
- response = requests.get(url, headers=headers, timeout=10)
478
-
479
  if response.status_code == 200:
480
  return True, "Username is valid"
481
  elif response.status_code == 404:
@@ -486,414 +171,6 @@ def validate_github_username(identifier):
486
  return False, f"Validation error: {str(e)}"
487
 
488
 
489
- # =============================================================================
490
- # PR STATISTICS
491
- # =============================================================================
492
-
493
- def calculate_pr_stats_from_metadata(metadata_list):
494
- """
495
- Calculate statistics from a list of PR metadata (lightweight objects).
496
- Works with minimal metadata: html_url, created_at, merged_at, closed_at.
497
-
498
- Returns a dictionary with comprehensive PR metrics.
499
-
500
- Acceptance rate is calculated as:
501
- merged PRs / (merged PRs + closed but not merged PRs) * 100
502
-
503
- This only counts PRs where a decision has been made (either merged or rejected/closed).
504
- """
505
- total_prs = len(metadata_list)
506
- merged = sum(1 for pr_meta in metadata_list if pr_meta.get('merged_at'))
507
-
508
- # Count closed PRs (rejected) - those with closed_at but no merged_at
509
- closed_not_merged = sum(1 for pr_meta in metadata_list
510
- if pr_meta.get('closed_at') and not pr_meta.get('merged_at'))
511
-
512
- # Total decisions made = merged + closed (rejected)
513
- total_decisions = merged + closed_not_merged
514
-
515
- # Calculate acceptance rate based on decisions made
516
- acceptance_rate = (merged / total_decisions * 100) if total_decisions > 0 else 0
517
-
518
- return {
519
- 'total_prs': total_prs,
520
- 'merged_prs': merged,
521
- 'acceptance_rate': round(acceptance_rate, 2),
522
- }
523
-
524
-
525
- def calculate_monthly_metrics_by_agent(top_n=None):
526
- """
527
- Calculate monthly metrics for all agents (or top N agents) for visualization.
528
- Loads data directly from SWE-Arena/pr_metadata dataset.
529
-
530
- Args:
531
- top_n: If specified, only return metrics for the top N agents by total PRs.
532
- Agents are ranked by their total PR count across all months.
533
-
534
- Returns:
535
- dict: {
536
- 'agents': list of agent names,
537
- 'months': list of month labels (e.g., '2025-01'),
538
- 'data': {
539
- agent_name: {
540
- 'acceptance_rates': list of acceptance rates by month,
541
- 'total_prs': list of PR counts by month,
542
- 'merged_prs': list of merged PR counts by month,
543
- 'closed_not_merged': list of closed but not merged PR counts by month
544
- }
545
- }
546
- }
547
- """
548
- # Load ALL agents from HuggingFace agents repo
549
- agents = load_agents_from_hf()
550
-
551
- # Create mapping from agent_identifier to agent_name
552
- identifier_to_name = {agent.get('github_identifier'): agent.get('name', 'Unknown') for agent in agents if agent.get('github_identifier')}
553
-
554
- # Load all PR metadata from pr_metadata dataset
555
- all_metadata = load_pr_metadata()
556
-
557
- if not all_metadata:
558
- return {'agents': [], 'months': [], 'data': {}}
559
-
560
- # Group by agent and month
561
- agent_month_data = defaultdict(lambda: defaultdict(list))
562
-
563
- for pr_meta in all_metadata:
564
- agent_identifier = pr_meta.get('agent_identifier')
565
- created_at = pr_meta.get('created_at')
566
-
567
- if not agent_identifier or not created_at:
568
- continue
569
-
570
- # Get agent_name from identifier
571
- agent_name = identifier_to_name.get(agent_identifier, agent_identifier)
572
-
573
- try:
574
- dt = parse_date_string(created_at)
575
- month_key = f"{dt.year}-{dt.month:02d}"
576
- agent_month_data[agent_name][month_key].append(pr_meta)
577
- except Exception as e:
578
- print(f"Warning: Could not parse date '{created_at}': {e}")
579
- continue
580
-
581
- # Get all unique months and sort them
582
- all_months = set()
583
- for agent_data in agent_month_data.values():
584
- all_months.update(agent_data.keys())
585
- months = sorted(list(all_months))
586
-
587
- # Calculate metrics for each agent and month
588
- result_data = {}
589
- for agent_name, month_dict in agent_month_data.items():
590
- acceptance_rates = []
591
- total_prs = []
592
- merged_prs = []
593
- closed_not_merged_list = []
594
-
595
- for month in months:
596
- prs_in_month = month_dict.get(month, [])
597
-
598
- # Count merged PRs
599
- merged_count = sum(1 for pr in prs_in_month if pr.get('merged_at'))
600
-
601
- # Count closed but not merged
602
- closed_not_merged_count = sum(1 for pr in prs_in_month
603
- if pr.get('closed_at') and not pr.get('merged_at'))
604
-
605
- # Total PRs created in this month
606
- total_count = len(prs_in_month)
607
-
608
- # Calculate acceptance rate
609
- total_decisions = merged_count + closed_not_merged_count
610
- acceptance_rate = (merged_count / total_decisions * 100) if total_decisions > 0 else None
611
-
612
- acceptance_rates.append(acceptance_rate)
613
- total_prs.append(total_count)
614
- merged_prs.append(merged_count)
615
- closed_not_merged_list.append(closed_not_merged_count)
616
-
617
- result_data[agent_name] = {
618
- 'acceptance_rates': acceptance_rates,
619
- 'total_prs': total_prs,
620
- 'merged_prs': merged_prs,
621
- 'closed_not_merged': closed_not_merged_list
622
- }
623
-
624
- # Filter to top N agents if specified
625
- agents_list = sorted(list(agent_month_data.keys()))
626
- if top_n is not None and top_n > 0:
627
- # Calculate total PRs for each agent across all months
628
- agent_totals = []
629
- for agent_name in agents_list:
630
- total_pr_count = sum(result_data[agent_name]['total_prs'])
631
- agent_totals.append((agent_name, total_pr_count))
632
-
633
- # Sort by total PRs (descending) and take top N
634
- agent_totals.sort(key=lambda x: x[1], reverse=True)
635
- top_agents = [agent_name for agent_name, _ in agent_totals[:top_n]]
636
-
637
- # Filter result_data to only include top agents
638
- result_data = {agent: result_data[agent] for agent in top_agents if agent in result_data}
639
- agents_list = top_agents
640
-
641
- return {
642
- 'agents': agents_list,
643
- 'months': months,
644
- 'data': result_data
645
- }
646
-
647
-
648
- # =============================================================================
649
- # PR METADATA STORAGE & RETRIEVAL
650
- # =============================================================================
651
-
652
- def group_metadata_by_date(metadata_list):
653
- """
654
- Group PR metadata by exact date (year.month.day) for efficient daily storage.
655
- Returns dict: {(year, month, day): [metadata_list]}
656
- """
657
- grouped = defaultdict(list)
658
-
659
- for pr_meta in metadata_list:
660
- created_at = pr_meta.get('created_at')
661
- if not created_at:
662
- continue
663
-
664
- try:
665
- dt = parse_date_string(created_at)
666
- key = (dt.year, dt.month, dt.day)
667
- grouped[key].append(pr_meta)
668
- except Exception as e:
669
- print(f"Warning: Could not parse date '{created_at}': {e}")
670
-
671
- return dict(grouped)
672
-
673
-
674
- def save_pr_metadata_to_hf(metadata_list, agent_identifier):
675
- """
676
- Save PR metadata to HuggingFace dataset, organized by [agent_identifier]/YYYY.MM.DD.jsonl.
677
- Each file is stored in the agent's folder and named YYYY.MM.DD.jsonl for that day's PRs.
678
-
679
- This function OVERWRITES existing files completely with fresh data from BigQuery.
680
- Uses batch upload to avoid rate limit (uploads entire folder in single operation).
681
-
682
- Args:
683
- metadata_list: List of PR metadata dictionaries
684
- agent_identifier: GitHub identifier of the agent (used as folder name)
685
- """
686
- import shutil
687
-
688
- try:
689
- token = get_hf_token()
690
- if not token:
691
- raise Exception("No HuggingFace token found")
692
-
693
- api = HfApi(token=token)
694
-
695
- # Group by date (year, month, day)
696
- grouped = group_metadata_by_date(metadata_list)
697
-
698
- if not grouped:
699
- print(f" No valid metadata to save for {agent_identifier}")
700
- return False
701
-
702
- # Create a temporary directory for batch upload
703
- temp_dir = tempfile.mkdtemp()
704
- agent_folder = os.path.join(temp_dir, agent_identifier)
705
- os.makedirs(agent_folder, exist_ok=True)
706
-
707
- try:
708
- print(f" 📦 Preparing batch upload for {len(grouped)} daily files...")
709
-
710
- # Process each daily file
711
- for (pr_year, month, day), day_metadata in grouped.items():
712
- filename = f"{agent_identifier}/{pr_year}.{month:02d}.{day:02d}.jsonl"
713
- local_filename = os.path.join(agent_folder, f"{pr_year}.{month:02d}.{day:02d}.jsonl")
714
-
715
- # Sort by created_at for better organization
716
- day_metadata.sort(key=lambda x: x.get('created_at', ''), reverse=True)
717
-
718
- # Save to temp directory (complete overwrite, no merging)
719
- save_jsonl(local_filename, day_metadata)
720
- print(f" Prepared {len(day_metadata)} PRs for {filename}")
721
-
722
- # Upload entire folder using upload_folder (single commit per agent)
723
- print(f" 📤 Uploading {len(grouped)} files ({len(metadata_list)} total PRs)...")
724
- upload_folder_with_backoff(
725
- api,
726
- folder_path=temp_dir,
727
- repo_id=PR_METADATA_REPO,
728
- repo_type="dataset",
729
- commit_message=f"Update PR metadata for {agent_identifier}"
730
- )
731
- print(f" ✓ Batch upload complete for {agent_identifier}")
732
-
733
- return True
734
-
735
- finally:
736
- # Always clean up temp directory
737
- if os.path.exists(temp_dir):
738
- shutil.rmtree(temp_dir)
739
-
740
- except Exception as e:
741
- print(f" ✗ Error saving PR metadata: {str(e)}")
742
- import traceback
743
- traceback.print_exc()
744
- return False
745
-
746
-
747
- def load_pr_metadata():
748
- """
749
- Loads PR metadata from the last LEADERBOARD_TIME_FRAME_DAYS only.
750
-
751
- Structure: [agent_identifier]/YYYY.MM.DD.jsonl
752
-
753
- Returns:
754
- List of dictionaries with 'agent_identifier' added to each PR metadata.
755
- Only includes PRs within the last LEADERBOARD_TIME_FRAME_DAYS.
756
- """
757
- try:
758
- api = HfApi()
759
- token = get_hf_token()
760
-
761
- # Calculate cutoff date for filtering
762
- cutoff_date = datetime.now(timezone.utc) - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
763
-
764
- # List all files in the repository
765
- files = list_repo_files_with_backoff(api, repo_id=PR_METADATA_REPO, repo_type="dataset")
766
-
767
- # Filter for files within the time frame: [agent_identifier]/YYYY.MM.DD.jsonl
768
- # Parse date from filename and only include files within LEADERBOARD_TIME_FRAME_DAYS
769
- relevant_files = []
770
- for f in files:
771
- if f.endswith('.jsonl'):
772
- parts = f.split('/')
773
- if len(parts) == 2: # [agent_identifier]/YYYY.MM.DD.jsonl
774
- filename = parts[1]
775
- try:
776
- # Parse date from filename: YYYY.MM.DD.jsonl
777
- date_part = filename.replace('.jsonl', '') # Get YYYY.MM.DD
778
- date_components = date_part.split('.')
779
- if len(date_components) == 3:
780
- file_year, file_month, file_day = map(int, date_components)
781
- file_date = datetime(file_year, file_month, file_day, tzinfo=timezone.utc)
782
-
783
- # Only include files within the time frame
784
- if file_date >= cutoff_date:
785
- relevant_files.append(f)
786
- except Exception:
787
- # If date parsing fails, skip this file
788
- continue
789
-
790
- total_months = LEADERBOARD_TIME_FRAME_DAYS // 30
791
- print(f"📥 Loading PR metadata from last {total_months} months ({len(relevant_files)} daily files across all agents)...")
792
-
793
- all_metadata = []
794
- for filename in relevant_files:
795
- try:
796
- # Extract agent_identifier from path (first part)
797
- # Format: agent_identifier/YYYY.MM.DD.jsonl
798
- parts = filename.split('/')
799
- if len(parts) != 2:
800
- print(f" Warning: Unexpected filename format: {filename}")
801
- continue
802
-
803
- agent_identifier = parts[0]
804
-
805
- file_path = hf_hub_download_with_backoff(
806
- repo_id=PR_METADATA_REPO,
807
- filename=filename,
808
- repo_type="dataset",
809
- token=token
810
- )
811
- day_metadata = load_jsonl(file_path)
812
-
813
- # Filter individual PRs by created_at date as a double-check
814
- for pr_meta in day_metadata:
815
- created_at = pr_meta.get('created_at')
816
- if created_at:
817
- try:
818
- dt = parse_date_string(created_at)
819
- if dt >= cutoff_date:
820
- pr_meta['agent_identifier'] = agent_identifier
821
- all_metadata.append(pr_meta)
822
- except Exception:
823
- # If date parsing fails, skip this PR
824
- continue
825
- else:
826
- # If no created_at, skip this PR
827
- continue
828
-
829
- print(f" ✓ Loaded PRs from {filename}")
830
- except Exception as e:
831
- print(f" Warning: Could not load {filename}: {str(e)}")
832
-
833
- print(f"✓ Loaded {len(all_metadata)} total PRs from last {total_months} months")
834
- return all_metadata
835
-
836
- except Exception as e:
837
- total_months = LEADERBOARD_TIME_FRAME_DAYS // 30
838
- print(f"✗ Error loading PR metadata from last {total_months} months: {str(e)}")
839
- return []
840
-
841
-
842
- def get_daily_files_last_time_frame(agent_identifier):
843
- """
844
- Get list of daily file paths for an agent from the configured time frame.
845
-
846
- Args:
847
- agent_identifier: GitHub identifier of the agent
848
-
849
- Returns:
850
- List of file paths in format: [agent_identifier]/YYYY.MM.DD.jsonl
851
- """
852
- try:
853
- api = HfApi()
854
- token = get_hf_token()
855
-
856
- # Calculate date range using configured time frame
857
- today = datetime.now(timezone.utc)
858
- cutoff_date = today - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
859
-
860
- # List all files in the repository
861
- files = list_repo_files_with_backoff(api, repo_id=PR_METADATA_REPO, repo_type="dataset")
862
-
863
- # Filter for files in this agent's folder
864
- agent_pattern = f"{agent_identifier}/"
865
- agent_files = [f for f in files if f.startswith(agent_pattern) and f.endswith('.jsonl')]
866
-
867
- # Filter by date range (extract date from filename)
868
- recent_files = []
869
- for filename in agent_files:
870
- try:
871
- # Extract date from filename: YYYY.MM.DD.jsonl
872
- parts = filename.split('/')
873
- if len(parts) != 2:
874
- continue
875
-
876
- date_part = parts[1].replace('.jsonl', '') # Get YYYY.MM.DD
877
- date_components = date_part.split('.')
878
- if len(date_components) != 3:
879
- continue
880
-
881
- file_year, file_month, file_day = map(int, date_components)
882
- file_date = datetime(file_year, file_month, file_day, tzinfo=timezone.utc)
883
-
884
- # Include if within configured time frame
885
- if cutoff_date <= file_date <= today:
886
- recent_files.append(filename)
887
- except Exception:
888
- continue
889
-
890
- return recent_files
891
-
892
- except Exception as e:
893
- print(f"Error getting daily files: {str(e)}")
894
- return []
895
-
896
-
897
  # =============================================================================
898
  # HUGGINGFACE DATASET OPERATIONS
899
  # =============================================================================
@@ -905,13 +182,11 @@ def load_agents_from_hf():
905
  agents = []
906
 
907
  # List all files in the repository
908
- files = list_repo_files_with_backoff(api, repo_id=AGENTS_REPO, repo_type="dataset")
909
 
910
  # Filter for JSON files only
911
  json_files = [f for f in files if f.endswith('.json')]
912
 
913
- print(f"Found {len(json_files)} agent files in {AGENTS_REPO}")
914
-
915
  # Download and parse each JSON file
916
  for json_file in json_files:
917
  try:
@@ -928,9 +203,11 @@ def load_agents_from_hf():
928
  if agent_data.get('status') != 'public':
929
  continue
930
 
931
- # Extract github_identifier from filename (remove .json extension)
932
- github_identifier = json_file.replace('.json', '')
933
- agent_data['github_identifier'] = github_identifier
 
 
934
 
935
  agents.append(agent_data)
936
 
@@ -938,7 +215,7 @@ def load_agents_from_hf():
938
  print(f"Warning: Could not load {json_file}: {str(e)}")
939
  continue
940
 
941
- print(f"Loaded {len(agents)} agents from HuggingFace")
942
  return agents
943
 
944
  except Exception as e:
@@ -954,37 +231,6 @@ def get_hf_token():
954
  return token
955
 
956
 
957
- def load_leaderboard_data_from_hf():
958
- """
959
- Load pre-computed leaderboard and monthly metrics data from HuggingFace.
960
-
961
- Returns:
962
- Dictionary with 'leaderboard', 'monthly_metrics', and 'last_updated' keys.
963
- Returns None if file doesn't exist or error occurs.
964
- """
965
- try:
966
- token = get_hf_token()
967
-
968
- # Download the swe-pr.json file
969
- file_path = hf_hub_download_with_backoff(
970
- repo_id=LEADERBOARD_REPO,
971
- filename="swe-pr.json",
972
- repo_type="dataset",
973
- token=token
974
- )
975
-
976
- with open(file_path, 'r') as f:
977
- data = json.load(f)
978
-
979
- print(f"✓ Loaded leaderboard data (last updated: {data.get('last_updated', 'Unknown')})")
980
- return data
981
-
982
- except Exception as e:
983
- print(f"⚠️ Could not load leaderboard data from HuggingFace: {str(e)}")
984
- print(f" Falling back to computing from raw PR metadata...")
985
- return None
986
-
987
-
988
  def upload_with_retry(api, path_or_fileobj, path_in_repo, repo_id, repo_type, token, max_retries=5):
989
  """
990
  Upload file to HuggingFace with exponential backoff retry logic.
@@ -1013,18 +259,18 @@ def upload_with_retry(api, path_or_fileobj, path_in_repo, repo_id, repo_type, to
1013
  token=token
1014
  )
1015
  if attempt > 0:
1016
- print(f" Upload succeeded on attempt {attempt + 1}/{max_retries}")
1017
  return True
1018
 
1019
  except Exception as e:
1020
  if attempt < max_retries - 1:
1021
  wait_time = delay + random.uniform(0, 1.0)
1022
- print(f" ⚠️ Upload failed (attempt {attempt + 1}/{max_retries}): {str(e)}")
1023
- print(f" Retrying in {wait_time:.1f} seconds...")
1024
  time.sleep(wait_time)
1025
  delay = min(delay * 2, 60.0) # Exponential backoff, max 60s
1026
  else:
1027
- print(f" Upload failed after {max_retries} attempts: {str(e)}")
1028
  raise
1029
 
1030
 
@@ -1054,7 +300,7 @@ def save_agent_to_hf(data):
1054
  repo_type="dataset",
1055
  token=token
1056
  )
1057
- print(f"Saved agent to HuggingFace: {filename}")
1058
  return True
1059
  finally:
1060
  # Always clean up local file, even if upload fails
@@ -1062,208 +308,52 @@ def save_agent_to_hf(data):
1062
  os.remove(filename)
1063
 
1064
  except Exception as e:
1065
- print(f"Error saving agent: {str(e)}")
1066
  return False
1067
 
1068
 
1069
- def save_leaderboard_and_metrics_to_hf():
1070
  """
1071
- Creates a comprehensive JSON file with both leaderboard stats and monthly metrics.
1072
- If the file exists, it will be overwritten.
1073
 
1074
  Returns:
1075
- bool: True if successful, False otherwise
 
1076
  """
1077
- import io
1078
-
1079
  try:
1080
  token = get_hf_token()
1081
- if not token:
1082
- raise Exception("No HuggingFace token found")
1083
-
1084
- api = HfApi(token=token)
1085
-
1086
- print(f"\n{'='*80}")
1087
- print(f"📊 Preparing leaderboard and metrics data for upload...")
1088
- print(f"{'='*80}\n")
1089
-
1090
- # Get leaderboard data
1091
- print(" Constructing leaderboard data...")
1092
- leaderboard_data = construct_leaderboard_from_metadata()
1093
-
1094
- # Get monthly metrics data (all agents, not just top N)
1095
- print(" Calculating monthly metrics...")
1096
- monthly_metrics = calculate_monthly_metrics_by_agent(top_n=None)
1097
-
1098
- # Combine into a single structure
1099
- combined_data = {
1100
- "leaderboard": leaderboard_data,
1101
- "monthly_metrics": monthly_metrics,
1102
- "metadata": {
1103
- "last_updated": datetime.now(timezone.utc).isoformat(),
1104
- "time_frame_days": LEADERBOARD_TIME_FRAME_DAYS,
1105
- "total_agents": len(leaderboard_data)
1106
- }
1107
- }
1108
-
1109
- print(f" Leaderboard entries: {len(leaderboard_data)}")
1110
- print(f" Monthly metrics for: {len(monthly_metrics['agents'])} agents")
1111
- print(f" Time frame: {LEADERBOARD_TIME_FRAME_DAYS} days")
1112
-
1113
- # Convert to JSON and create file-like object
1114
- json_content = json.dumps(combined_data, indent=2)
1115
- file_like_object = io.BytesIO(json_content.encode('utf-8'))
1116
 
1117
- # Upload to HuggingFace (will overwrite if exists)
1118
- print(f"\n🤗 Uploading to {LEADERBOARD_REPO}...")
1119
- upload_file_with_backoff(
1120
- api,
1121
- path_or_fileobj=file_like_object,
1122
- path_in_repo="swe-pr.json",
1123
  repo_id=LEADERBOARD_REPO,
 
1124
  repo_type="dataset",
1125
- token=token,
1126
- commit_message=f"Update leaderboard data - {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC"
1127
  )
1128
 
1129
- print(f" ✓ Successfully uploaded swe-pr.json")
1130
- print(f"{'='*80}\n")
1131
-
1132
- return True
1133
-
1134
- except Exception as e:
1135
- print(f" ✗ Error saving leaderboard data: {str(e)}")
1136
- import traceback
1137
- traceback.print_exc()
1138
- return False
1139
-
1140
-
1141
- # =============================================================================
1142
- # DATA MANAGEMENT
1143
- # =============================================================================
1144
-
1145
- def mine_all_agents():
1146
- """
1147
- Mine PR metadata for all agents within UPDATE_TIME_FRAME_DAYS and save to HuggingFace.
1148
- Uses BATCHED BigQuery queries for all agents (efficient approach).
1149
- """
1150
- # Load agent metadata from HuggingFace
1151
- agents = load_agents_from_hf()
1152
- if not agents:
1153
- print("No agents found in HuggingFace dataset")
1154
- return
1155
-
1156
- # Extract all identifiers
1157
- identifiers = [agent['github_identifier'] for agent in agents if agent.get('github_identifier')]
1158
- if not identifiers:
1159
- print("No valid agent identifiers found")
1160
- return
1161
-
1162
- print(f"\n{'='*80}")
1163
- print(f"Starting PR metadata mining for {len(identifiers)} agents")
1164
- print(f"Time frame: Last {UPDATE_TIME_FRAME_DAYS} days")
1165
- print(f"Data source: BigQuery + GitHub Archive (BATCHED QUERIES)")
1166
- print(f"{'='*80}\n")
1167
-
1168
- # Initialize BigQuery client
1169
- try:
1170
- client = get_bigquery_client()
1171
- except Exception as e:
1172
- print(f"✗ Failed to initialize BigQuery client: {str(e)}")
1173
- return
1174
-
1175
- # Define time range: past UPDATE_TIME_FRAME_DAYS (excluding today)
1176
- current_time = datetime.now(timezone.utc)
1177
- end_date = current_time.replace(hour=0, minute=0, second=0, microsecond=0)
1178
- start_date = end_date - timedelta(days=UPDATE_TIME_FRAME_DAYS)
1179
-
1180
- try:
1181
- # Use batched approach for better performance
1182
- # upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
1183
- all_metadata = fetch_issue_metadata_batched(
1184
- client, identifiers, start_date, end_date, batch_size=100, upload_immediately=True
1185
- )
1186
 
1187
- # Calculate summary statistics
1188
- total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
1189
- agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
1190
 
1191
- print(f"\n{'='*80}")
1192
- print(f"✅ BigQuery mining and upload complete!")
1193
- print(f" Total agents: {len(agents)}")
1194
- print(f" Agents with data: {agents_with_data}")
1195
- print(f" Total PRs found: {total_prs}")
1196
- print(f"{'='*80}\n")
1197
 
1198
  except Exception as e:
1199
- print(f" Error during BigQuery fetch: {str(e)}")
1200
- import traceback
1201
- traceback.print_exc()
1202
- return
1203
-
1204
- # After mining is complete, save leaderboard and metrics to HuggingFace
1205
- print(f"📤 Uploading leaderboard and metrics data...")
1206
- if save_leaderboard_and_metrics_to_hf():
1207
- print(f"✓ Leaderboard and metrics successfully uploaded to {LEADERBOARD_REPO}")
1208
- else:
1209
- print(f"⚠️ Failed to upload leaderboard and metrics data")
1210
-
1211
-
1212
- def construct_leaderboard_from_metadata():
1213
- """
1214
- Construct leaderboard from stored PR metadata instead of fetching all PRs.
1215
- Much more memory-efficient and faster.
1216
-
1217
- Returns dictionary of agent stats.
1218
- """
1219
- print("📊 Constructing leaderboard from PR metadata...")
1220
- # Load agents
1221
- agents = load_agents_from_hf()
1222
- if not agents:
1223
- print("No agents found")
1224
- return {}
1225
-
1226
- # Load all PR metadata
1227
- all_metadata = load_pr_metadata()
1228
-
1229
- cache_dict = {}
1230
-
1231
- for agent in agents:
1232
- identifier = agent.get('github_identifier')
1233
- agent_name = agent.get('name', 'Unknown')
1234
-
1235
- # Filter metadata for this agent
1236
- bot_metadata = [pr for pr in all_metadata if pr.get('agent_identifier') == identifier]
1237
-
1238
- # Calculate stats
1239
- stats = calculate_pr_stats_from_metadata(bot_metadata)
1240
-
1241
- cache_dict[identifier] = {
1242
- 'name': agent_name,
1243
- 'website': agent.get('website', 'Unknown'),
1244
- 'github_identifier': identifier,
1245
- **stats
1246
- }
1247
-
1248
- return cache_dict
1249
 
1250
 
1251
  # =============================================================================
1252
  # UI FUNCTIONS
1253
  # =============================================================================
1254
 
1255
- def generate_color(index, total):
1256
- """Generate distinct colors using HSL color space for better distribution"""
1257
- hue = (index * 360 / total) % 360
1258
- saturation = 70 + (index % 3) * 10 # Vary saturation slightly
1259
- lightness = 45 + (index % 2) * 10 # Vary lightness slightly
1260
- return f'hsl({hue}, {saturation}%, {lightness}%)'
1261
-
1262
-
1263
  def create_monthly_metrics_plot(top_n=5):
1264
  """
1265
  Create a Plotly figure with dual y-axes showing:
1266
- - Left y-axis: Acceptance rate (%) as line curves
1267
  - Right y-axis: Total PRs created as bar charts
1268
 
1269
  Each agent gets a unique color for both their line and bars.
@@ -1271,37 +361,47 @@ def create_monthly_metrics_plot(top_n=5):
1271
  Args:
1272
  top_n: Number of top agents to show (default: 5)
1273
  """
1274
- global _LEADERBOARD_CACHE
1275
-
1276
- # Load from cache if available
1277
- if _LEADERBOARD_CACHE is not None:
1278
- metrics = _LEADERBOARD_CACHE.get('monthly_metrics', {})
1279
-
1280
- # Apply top_n filter if specified
1281
- if top_n is not None and top_n > 0 and metrics.get('agents'):
1282
- agents_list = metrics['agents']
1283
- data = metrics['data']
1284
-
1285
- # Calculate total PRs for each agent across all months
1286
- agent_totals = []
1287
- for agent_name in agents_list:
1288
- total_pr_count = sum(data[agent_name]['total_prs'])
1289
- agent_totals.append((agent_name, total_pr_count))
1290
-
1291
- # Sort by total PRs (descending) and take top N
1292
- agent_totals.sort(key=lambda x: x[1], reverse=True)
1293
- top_agents = [agent_name for agent_name, _ in agent_totals[:top_n]]
1294
-
1295
- # Filter result_data to only include top agents
1296
- filtered_data = {agent: data[agent] for agent in top_agents if agent in data}
1297
- metrics = {
1298
- 'agents': top_agents,
1299
- 'months': metrics['months'],
1300
- 'data': filtered_data
1301
- }
1302
- else:
1303
- # Fallback: compute from PR metadata
1304
- metrics = calculate_monthly_metrics_by_agent(top_n=top_n)
1305
 
1306
  if not metrics['agents'] or not metrics['months']:
1307
  # Return an empty figure with a message
@@ -1322,11 +422,19 @@ def create_monthly_metrics_plot(top_n=5):
1322
  # Create figure with secondary y-axis
1323
  fig = make_subplots(specs=[[{"secondary_y": True}]])
1324
1325
  agents = metrics['agents']
1326
  months = metrics['months']
1327
  data = metrics['data']
1328
 
1329
- # Generate colors for all agents using HSL
1330
  agent_colors = {agent: generate_color(idx, len(agents)) for idx, agent in enumerate(agents)}
1331
 
1332
  # Add traces for each agent
@@ -1348,10 +456,11 @@ def create_monthly_metrics_plot(top_n=5):
1348
  name=agent_name,
1349
  mode='lines+markers',
1350
  line=dict(color=color, width=2),
1351
- marker=dict(size=6),
1352
  legendgroup=agent_name,
1353
- showlegend=True,
1354
- hovertemplate='<b>%{fullData.name}</b><br>' +
 
1355
  'Acceptance Rate: %{y:.2f}%<br>' +
1356
  '<extra></extra>'
1357
  ),
@@ -1375,8 +484,9 @@ def create_monthly_metrics_plot(top_n=5):
1375
  name=agent_name,
1376
  marker=dict(color=color, opacity=0.6),
1377
  legendgroup=agent_name,
1378
- showlegend=False, # Don't show in legend (already shown for line)
1379
- hovertemplate='<b>%{fullData.name}</b><br>' +
 
1380
  'Total PRs: %{y}<br>' +
1381
  '<extra></extra>',
1382
  offsetgroup=agent_name # Group bars by agent for proper spacing
@@ -1386,23 +496,26 @@ def create_monthly_metrics_plot(top_n=5):
1386
 
1387
  # Update axes labels
1388
  fig.update_xaxes(title_text=None)
1389
- fig.update_yaxes(title_text="<b>Acceptance Rate (%)</b>", secondary_y=False)
1390
  fig.update_yaxes(title_text="<b>Total PRs</b>", secondary_y=True)
1391
 
1392
  # Update layout
 
1393
  fig.update_layout(
1394
  title=None,
1395
- hovermode='closest',
1396
  barmode='group',
1397
  height=600,
1398
- legend=dict(
1399
- orientation="h",
1400
- yanchor="bottom",
1401
- y=1.02,
1402
- xanchor="right",
1403
- x=1
1404
- ),
1405
- margin=dict(l=50, r=50, t=100, b=50)
1406
  )
1407
 
1408
  return fig
@@ -1410,36 +523,51 @@ def create_monthly_metrics_plot(top_n=5):
1410
 
1411
  def get_leaderboard_dataframe():
1412
  """
1413
- Load leaderboard data from cached JSON and convert to pandas DataFrame for display.
1414
- Falls back to computing from PR metadata if cache is not available.
1415
  Returns formatted DataFrame sorted by total PRs.
1416
  """
1417
- global _LEADERBOARD_CACHE
1418
 
1419
- # Load from cache if available
1420
- if _LEADERBOARD_CACHE is not None:
1421
- cache_dict = _LEADERBOARD_CACHE.get('leaderboard', {})
1422
- else:
1423
- # Fallback: compute from PR metadata
1424
- cache_dict = construct_leaderboard_from_metadata()
1425
 
1426
  if not cache_dict:
 
1427
  # Return empty DataFrame with correct columns if no data
1428
  column_names = [col[0] for col in LEADERBOARD_COLUMNS]
1429
  return pd.DataFrame(columns=column_names)
1430
 
1431
  rows = []
 
1432
  for identifier, data in cache_dict.items():
 
 
 
1433
  # Filter out agents with zero total PRs
1434
- if data.get('total_prs', 0) > 0:
1435
- # Only include display-relevant fields
1436
- rows.append([
1437
- data.get('name', 'Unknown'),
1438
- data.get('website', 'Unknown'),
1439
- data.get('total_prs', 0),
1440
- data.get('merged_prs', 0),
1441
- data.get('acceptance_rate', 0.0),
1442
- ])
1443
 
1444
  # Create DataFrame
1445
  column_names = [col[0] for col in LEADERBOARD_COLUMNS]
@@ -1455,111 +583,125 @@ def get_leaderboard_dataframe():
1455
  if "Total PRs" in df.columns and not df.empty:
1456
  df = df.sort_values(by="Total PRs", ascending=False).reset_index(drop=True)
1457
 
 
 
 
1458
  return df
1459
 
1460
 
1461
- def submit_agent(identifier, agent_name, organization, description, website):
1462
  """
1463
  Submit a new agent to the leaderboard.
1464
  Validates input and saves submission.
1465
- PR data will be populated by the monthly mining task.
1466
  """
1467
  # Validate required fields
1468
  if not identifier or not identifier.strip():
1469
- return " GitHub identifier is required", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1470
  if not agent_name or not agent_name.strip():
1471
- return " Agent name is required", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1472
  if not organization or not organization.strip():
1473
- return " Organization name is required", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1474
  if not website or not website.strip():
1475
- return " Website URL is required", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1476
 
1477
  # Clean inputs
1478
  identifier = identifier.strip()
1479
  agent_name = agent_name.strip()
1480
  organization = organization.strip()
1481
- description = description.strip()
1482
  website = website.strip()
1483
 
1484
  # Validate GitHub identifier
1485
  is_valid, message = validate_github_username(identifier)
1486
  if not is_valid:
1487
- return f" {message}", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1488
 
1489
  # Check for duplicates by loading agents from HuggingFace
1490
  agents = load_agents_from_hf()
1491
  if agents:
1492
  existing_names = {agent['github_identifier'] for agent in agents}
1493
  if identifier in existing_names:
1494
- return f"⚠️ Agent with identifier '{identifier}' already exists", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1495
 
1496
  # Create submission
1497
  submission = {
1498
  'name': agent_name,
1499
  'organization': organization,
1500
  'github_identifier': identifier,
1501
- 'description': description,
1502
  'website': website,
 
1503
  }
1504
 
1505
  # Save to HuggingFace
1506
  if not save_agent_to_hf(submission):
1507
- return " Failed to save submission", get_leaderboard_dataframe(), create_monthly_metrics_plot()
 
 
 
1508
 
1509
- success_msg = f"✅ Successfully submitted {agent_name}!\n\nPR data will be populated by the monthly mining task (runs every 1st of the month at 12:00 AM UTC)."
1510
- return success_msg, get_leaderboard_dataframe(), create_monthly_metrics_plot()
1511
 
1512
 
1513
  # =============================================================================
1514
  # GRADIO APPLICATION
1515
  # =============================================================================
1516
 
1517
- print(f"\n🚀 Starting SWE Agent PR Leaderboard")
1518
- print(f" Leaderboard time frame: {LEADERBOARD_TIME_FRAME_DAYS} days ({LEADERBOARD_TIME_FRAME_DAYS // 30} months)")
1519
- print(f" Mining update frequency: Every {UPDATE_TIME_FRAME_DAYS} days\n")
1520
 
1521
- # Start APScheduler for monthly PR mining at 12:00 AM UTC every 1st of the month
1522
  scheduler = BackgroundScheduler(timezone="UTC")
1523
  scheduler.add_job(
1524
- mine_all_agents,
1525
- trigger=CronTrigger(day=1, hour=0, minute=0), # 12:00 AM UTC every 1st of the month
1526
- id='monthly_pr_mining',
1527
- name='Monthly PR Mining',
1528
  replace_existing=True
1529
  )
1530
  scheduler.start()
1531
  print(f"\n{'='*80}")
1532
- print(f"Scheduler initialized successfully")
1533
- print(f"⛏️ Mining schedule: Every 1st of the month at 12:00 AM UTC")
1534
- print(f"📥 On startup: Only loads cached data from HuggingFace (no mining)")
1535
  print(f"{'='*80}\n")
1536
 
1537
- # Load leaderboard data from HuggingFace at startup
1538
- print(f"📥 Loading leaderboard data from HuggingFace...")
1539
- _LEADERBOARD_CACHE = load_leaderboard_data_from_hf()
1540
-
1541
- if _LEADERBOARD_CACHE is None:
1542
- print(f"⚠️ No cached leaderboard data found - will compute from raw PR metadata")
1543
- else:
1544
- print(f"✓ Leaderboard cache loaded successfully")
1545
-
1546
- print()
1547
-
1548
  # Create Gradio interface
1549
  with gr.Blocks(title="SWE Agent PR Leaderboard", theme=gr.themes.Soft()) as app:
1550
- total_months = LEADERBOARD_TIME_FRAME_DAYS // 30
1551
-
1552
- gr.Markdown("# 🏆 SWE Agent PR Leaderboard")
1553
  gr.Markdown(f"Track and compare GitHub pull request statistics for SWE agents")
1554
 
1555
  with gr.Tabs():
1556
 
1557
  # Leaderboard Tab
1558
- with gr.Tab("📊 Leaderboard"):
1559
- gr.Markdown(f"*All statistics are based on PRs from the last {total_months} months*")
1560
-
1561
  leaderboard_table = Leaderboard(
1562
- value=get_leaderboard_dataframe(),
1563
  datatype=LEADERBOARD_COLUMNS,
1564
  search_columns=["Agent Name", "Website"],
1565
  filter_columns=[
@@ -1574,16 +716,30 @@ with gr.Blocks(title="SWE Agent PR Leaderboard", theme=gr.themes.Soft()) as app:
1574
  ]
1575
  )
1576
 
1577
- gr.Markdown("### Monthly Metrics - Top 5 Agents")
1578
- gr.Markdown("Track acceptance rates and PR activity over time for the most active agents")
1579
 
1580
- monthly_plot = gr.Plot(
1581
- value=create_monthly_metrics_plot(),
1582
- label="Monthly PR Metrics"
 
 
 
 
1583
  )
1584
 
 
1585
  # Submit Agent Tab
1586
- with gr.Tab("Submit Agent"):
1587
 
1588
  gr.Markdown("### Submit Your Agent")
1589
  gr.Markdown("Fill in the details below to add your agent to the leaderboard.")
@@ -1592,7 +748,7 @@ with gr.Blocks(title="SWE Agent PR Leaderboard", theme=gr.themes.Soft()) as app:
1592
  with gr.Column():
1593
  github_input = gr.Textbox(
1594
  label="GitHub Identifier*",
1595
- placeholder="Your agent username (e.g., my-agent-bot)"
1596
  )
1597
  name_input = gr.Textbox(
1598
  label="Agent Name*",
@@ -1604,11 +760,6 @@ with gr.Blocks(title="SWE Agent PR Leaderboard", theme=gr.themes.Soft()) as app:
1604
  label="Organization*",
1605
  placeholder="Your organization or team name"
1606
  )
1607
- description_input = gr.Textbox(
1608
- label="Description",
1609
- placeholder="Brief description of your agent",
1610
- lines=3
1611
- )
1612
  website_input = gr.Textbox(
1613
  label="Website*",
1614
  placeholder="https://your-agent-website.com"
@@ -1626,8 +777,8 @@ with gr.Blocks(title="SWE Agent PR Leaderboard", theme=gr.themes.Soft()) as app:
1626
  # Event handler
1627
  submit_button.click(
1628
  fn=submit_agent,
1629
- inputs=[github_input, name_input, organization_input, description_input, website_input],
1630
- outputs=[submission_status, leaderboard_table, monthly_plot]
1631
  )
1632
 
1633
 
 
3
  import json
4
  import os
5
  import time
 
6
  import requests
 
 
7
  from huggingface_hub import HfApi, hf_hub_download
8
  from huggingface_hub.errors import HfHubHTTPError
9
+ import backoff
10
  from dotenv import load_dotenv
11
  import pandas as pd
 
12
  import random
13
  import plotly.graph_objects as go
14
  from plotly.subplots import make_subplots
15
  from apscheduler.schedulers.background import BackgroundScheduler
16
  from apscheduler.triggers.cron import CronTrigger
 
17
 
18
  # Load environment variables
19
  load_dotenv()
 
23
  # =============================================================================
24
 
25
  AGENTS_REPO = "SWE-Arena/bot_metadata" # HuggingFace dataset for agent metadata
26
+ LEADERBOARD_REPO = "SWE-Arena/leaderboard_metadata" # HuggingFace dataset for leaderboard data
 
 
 
27
 
28
  LEADERBOARD_COLUMNS = [
29
  ("Agent Name", "string"),
 
33
  ("Acceptance Rate (%)", "number"),
34
  ]
35
 
 
 
 
36
  # =============================================================================
37
+ # HUGGINGFACE API WRAPPERS WITH BACKOFF
 
 
 
 
 
 
 
 
38
  # =============================================================================
39
 
40
  def is_rate_limit_error(e):
 
44
  return False
45
 
46
 
 
 
 
 
 
 
 
 
47
  @backoff.on_exception(
48
  backoff.expo,
49
  HfHubHTTPError,
 
50
  max_tries=8,
51
+ base=300,
52
+ max_value=3600,
53
+ giveup=lambda e: not is_rate_limit_error(e),
54
+ on_backoff=lambda details: print(
55
+ f"Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/8..."
56
+ )
57
  )
58
  def list_repo_files_with_backoff(api, **kwargs):
59
+ """Wrapper for api.list_repo_files() with exponential backoff for rate limits."""
60
  return api.list_repo_files(**kwargs)
61
 
62
 
63
  @backoff.on_exception(
64
  backoff.expo,
65
  HfHubHTTPError,
 
66
  max_tries=8,
67
+ base=300,
68
+ max_value=3600,
69
+ giveup=lambda e: not is_rate_limit_error(e),
70
+ on_backoff=lambda details: print(
71
+ f"Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/8..."
72
+ )
73
  )
74
  def hf_hub_download_with_backoff(**kwargs):
75
+ """Wrapper for hf_hub_download() with exponential backoff for rate limits."""
76
  return hf_hub_download(**kwargs)
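As a minimal sketch (not part of this commit), the same decorator pattern could cover any other `HfApi` call that can return HTTP 429. The wrapper name below is illustrative; it reuses the `is_rate_limit_error()` helper defined above and the real `HfApi.upload_file()` API.

```python
# Illustrative only: backoff-wrapped upload, mirroring the two read wrappers above.
import backoff
from huggingface_hub import HfApi
from huggingface_hub.errors import HfHubHTTPError


@backoff.on_exception(
    backoff.expo,
    HfHubHTTPError,
    max_tries=8,
    base=300,
    max_value=3600,
    giveup=lambda e: not is_rate_limit_error(e),  # helper defined earlier in this section
    on_backoff=lambda details: print(
        f"Rate limited. Retrying in {details['wait']:.0f}s - attempt {details['tries']}/8..."
    ),
)
def upload_file_with_backoff(api: HfApi, **kwargs):
    """Wrapper for api.upload_file() with exponential backoff for rate limits."""
    return api.upload_file(**kwargs)
```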
77
 
78
 
 
 
 
 
 
79
  # =============================================================================
80
+ # GITHUB API OPERATIONS
81
  # =============================================================================
82
 
83
+ def request_with_backoff(method, url, *, headers=None, params=None, json_body=None, data=None, max_retries=10, timeout=30):
84
  """
85
+ Perform an HTTP request with exponential backoff and jitter for GitHub API.
86
+ Retries on 403/429 (rate limits), 5xx server errors, and transient network exceptions.
87
 
88
+ Returns the final requests.Response on success or non-retryable status, or None after exhausting retries.
 
 
 
 
 
 
 
 
 
89
  """
90
+ delay = 1.0
91
+ for attempt in range(max_retries):
 
 
 
 
 
 
92
  try:
93
+ resp = requests.request(
94
+ method,
95
+ url,
96
+ headers=headers or {},
97
+ params=params,
98
+ json=json_body,
99
+ data=data,
100
+ timeout=timeout
101
  )
102
 
103
+ status = resp.status_code
 
 
 
 
 
 
 
104
 
105
+ # Success
106
+ if 200 <= status < 300:
107
+ return resp
108
 
109
+ # Rate limits or server errors -> retry with backoff
110
+ if status in (403, 429) or 500 <= status < 600:
111
+ wait = None
 
 
 
 
 
 
 
112
 
113
+ # Prefer Retry-After when present
114
+ retry_after = resp.headers.get('Retry-After') or resp.headers.get('retry-after')
115
+ if retry_after:
116
+ try:
117
+ wait = float(retry_after)
118
+ except Exception:
119
+ wait = None
 
 
 
 
120
 
121
+ # Fallback to X-RateLimit-Reset when 403/429
122
+ if wait is None and status in (403, 429):
123
+ reset_hdr = resp.headers.get('X-RateLimit-Reset') or resp.headers.get('x-ratelimit-reset')
124
+ if reset_hdr:
125
+ try:
126
+ reset_timestamp = int(float(reset_hdr))
127
+ wait = max(reset_timestamp - time.time() + 2, 1)
128
+ except Exception:
129
+ wait = None
130
 
131
+ # Final fallback: exponential backoff with jitter
132
+ if wait is None:
133
+ wait = delay + random.uniform(0, 0.5)
 
 
 
 
 
 
134
 
135
+ # Cap individual wait to avoid extreme sleeps
136
+ wait = max(1.0, min(wait, 120.0))
137
+ print(f"GitHub API {status}. Backing off {wait:.1f}s (attempt {attempt + 1}/{max_retries})...")
138
+ time.sleep(wait)
139
+ delay = min(delay * 2, 60.0)
140
+ continue
141
 
142
+ # Non-retryable error; return response for caller to handle
143
+ return resp
144
 
145
+ except requests.RequestException as e:
146
+ # Network error -> retry with backoff
147
+ wait = delay + random.uniform(0, 0.5)
148
+ wait = max(1.0, min(wait, 60.0))
149
+ print(f"Request error: {e}. Retrying in {wait:.1f}s (attempt {attempt + 1}/{max_retries})...")
150
+ time.sleep(wait)
151
+ delay = min(delay * 2, 60.0)
152
 
153
+ print(f"Exceeded max retries for {url}")
154
+ return None
 
 
 
 
155
 
156
 
157
  def validate_github_username(identifier):
158
+ """Verify that a GitHub identifier exists with backoff-aware requests."""
159
  try:
 
 
160
  url = f'https://api.github.com/users/{identifier}'
161
+ response = request_with_backoff('GET', url, max_retries=1)
162
+ if response is None:
163
+ return False, "Validation error: network/rate limit exhausted"
164
  if response.status_code == 200:
165
  return True, "Username is valid"
166
  elif response.status_code == 404:
 
171
  return False, f"Validation error: {str(e)}"
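A rough usage sketch for `request_with_backoff()` beyond username validation (illustrative only; the endpoint choice and the `GITHUB_TOKEN` environment variable are assumptions, not part of app.py):

```python
# Illustrative only: point request_with_backoff() at GitHub's rate-limit endpoint
# to see how many core API calls remain before the next reset.
import os

headers = {"Accept": "application/vnd.github+json"}
if os.getenv("GITHUB_TOKEN"):
    headers["Authorization"] = f"Bearer {os.getenv('GITHUB_TOKEN')}"

resp = request_with_backoff("GET", "https://api.github.com/rate_limit", headers=headers)
if resp is not None and resp.status_code == 200:
    remaining = resp.json()["resources"]["core"]["remaining"]
    print(f"Core API calls remaining: {remaining}")
```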
172
 
173
 
 
 
 
 
 
 
 
174
  # =============================================================================
175
  # HUGGINGFACE DATASET OPERATIONS
176
  # =============================================================================
 
182
  agents = []
183
 
184
  # List all files in the repository
185
+ files = list_repo_files_with_backoff(api=api, repo_id=AGENTS_REPO, repo_type="dataset")
186
 
187
  # Filter for JSON files only
188
  json_files = [f for f in files if f.endswith('.json')]
189
 
 
 
190
  # Download and parse each JSON file
191
  for json_file in json_files:
192
  try:
 
203
  if agent_data.get('status') != 'public':
204
  continue
205
 
206
+ # Extract github_identifier from filename (e.g., "agent[bot].json" -> "agent[bot]")
207
+ filename_identifier = json_file.replace('.json', '')
208
+
209
+ # Add or override github_identifier to match filename
210
+ agent_data['github_identifier'] = filename_identifier
211
 
212
  agents.append(agent_data)
213
 
 
215
  print(f"Warning: Could not load {json_file}: {str(e)}")
216
  continue
217
 
218
+ print(f"Loaded {len(agents)} agents from HuggingFace")
219
  return agents
220
 
221
  except Exception as e:
 
231
  return token
232
 
233
 
 
 
 
 
 
 
234
  def upload_with_retry(api, path_or_fileobj, path_in_repo, repo_id, repo_type, token, max_retries=5):
235
  """
236
  Upload file to HuggingFace with exponential backoff retry logic.
 
259
  token=token
260
  )
261
  if attempt > 0:
262
+ print(f" Upload succeeded on attempt {attempt + 1}/{max_retries}")
263
  return True
264
 
265
  except Exception as e:
266
  if attempt < max_retries - 1:
267
  wait_time = delay + random.uniform(0, 1.0)
268
+ print(f" Upload failed (attempt {attempt + 1}/{max_retries}): {str(e)}")
269
+ print(f" Retrying in {wait_time:.1f} seconds...")
270
  time.sleep(wait_time)
271
  delay = min(delay * 2, 60.0) # Exponential backoff, max 60s
272
  else:
273
+ print(f" Upload failed after {max_retries} attempts: {str(e)}")
274
  raise
275
 
276
 
 
300
  repo_type="dataset",
301
  token=token
302
  )
303
+ print(f"Saved agent to HuggingFace: {filename}")
304
  return True
305
  finally:
306
  # Always clean up local file, even if upload fails
 
308
  os.remove(filename)
309
 
310
  except Exception as e:
311
+ print(f"Error saving agent: {str(e)}")
312
  return False
313
 
314
 
315
+ def load_leaderboard_data_from_hf():
316
  """
317
+ Load leaderboard data and monthly metrics from HuggingFace dataset.
 
318
 
319
  Returns:
320
+ dict: Dictionary with 'leaderboard', 'monthly_metrics', and 'metadata' keys
321
+ Returns None if file doesn't exist or error occurs
322
  """
 
 
323
  try:
324
  token = get_hf_token()
325
+ filename = "swe-pr.json"
 
 
 
 
 
 
 
326
 
327
+ # Download file
328
+ file_path = hf_hub_download_with_backoff(
 
 
 
 
329
  repo_id=LEADERBOARD_REPO,
330
+ filename=filename,
331
  repo_type="dataset",
332
+ token=token
 
333
  )
334
 
335
+ # Load JSON data
336
+ with open(file_path, 'r') as f:
337
+ data = json.load(f)
 
 
 
 
 
 
 
 
338
 
339
+ last_updated = data.get('metadata', {}).get('last_updated', 'Unknown')
340
+ print(f"Loaded leaderboard data from HuggingFace (last updated: {last_updated})")
 
341
 
342
+ return data
 
 
 
 
 
343
 
344
  except Exception as e:
345
+ print(f"Could not load leaderboard data from HuggingFace: {str(e)}")
346
+ return None
 
 
 
 
 
 
 
347
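For reference, the shape of `swe-pr.json` that the loading and display functions expect can be sketched as follows. The keys mirror what the code reads (`leaderboard`, `monthly_metrics`, `metadata.last_updated`, and the per-agent fields); the concrete values are made up for illustration.

```python
# Illustrative payload shape only; values are fabricated examples.
example_leaderboard_payload = {
    "leaderboard": {
        "my-agent[bot]": {
            "name": "My Agent",
            "website": "https://example.com",
            "github_identifier": "my-agent[bot]",
            "total_prs": 120,
            "merged_prs": 80,
            "acceptance_rate": 66.7,
        }
    },
    "monthly_metrics": {
        "agents": ["My Agent"],
        "months": ["2025-09", "2025-10"],
        "data": {
            "My Agent": {
                "acceptance_rates": [60.0, 66.7],
                "total_prs": [50, 70],
                "merged_prs": [30, 50],
                "closed_not_merged": [20, 20],
            }
        },
    },
    "metadata": {"last_updated": "2025-11-01T00:00:00+00:00"},
}
```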
 
348
 
349
  # =============================================================================
350
  # UI FUNCTIONS
351
  # =============================================================================
352
 
 
 
 
 
 
 
 
 
353
  def create_monthly_metrics_plot(top_n=5):
354
  """
355
  Create a Plotly figure with dual y-axes showing:
356
+ - Left y-axis: Acceptance Rate (%) as line curves
357
  - Right y-axis: Total PRs created as bar charts
358
 
359
  Each agent gets a unique color for both their line and bars.
 
361
  Args:
362
  top_n: Number of top agents to show (default: 5)
363
  """
364
+ # Load from saved dataset
365
+ saved_data = load_leaderboard_data_from_hf()
366
+
367
+ if not saved_data or 'monthly_metrics' not in saved_data:
368
+ # Return an empty figure with a message
369
+ fig = go.Figure()
370
+ fig.add_annotation(
371
+ text="No data available for visualization",
372
+ xref="paper", yref="paper",
373
+ x=0.5, y=0.5, showarrow=False,
374
+ font=dict(size=16)
375
+ )
376
+ fig.update_layout(
377
+ title=None,
378
+ xaxis_title=None,
379
+ height=500
380
+ )
381
+ return fig
382
+
383
+ metrics = saved_data['monthly_metrics']
384
+ print(f"Loaded monthly metrics from saved dataset")
385
+
386
+ # Apply top_n filter if specified
387
+ if top_n is not None and top_n > 0 and metrics.get('agents'):
388
+ # Calculate total PRs for each agent
389
+ agent_totals = []
390
+ for agent_name in metrics['agents']:
391
+ agent_data = metrics['data'].get(agent_name, {})
392
+ total_prs = sum(agent_data.get('total_prs', []))
393
+ agent_totals.append((agent_name, total_prs))
394
+
395
+ # Sort by total PRs and take top N
396
+ agent_totals.sort(key=lambda x: x[1], reverse=True)
397
+ top_agents = [agent_name for agent_name, _ in agent_totals[:top_n]]
398
+
399
+ # Filter metrics to only include top agents
400
+ metrics = {
401
+ 'agents': top_agents,
402
+ 'months': metrics['months'],
403
+ 'data': {agent: metrics['data'][agent] for agent in top_agents if agent in metrics['data']}
404
+ }
405
 
406
  if not metrics['agents'] or not metrics['months']:
407
  # Return an empty figure with a message
 
422
  # Create figure with secondary y-axis
423
  fig = make_subplots(specs=[[{"secondary_y": True}]])
424
 
425
+ # Generate unique colors for many agents using HSL color space
426
+ def generate_color(index, total):
427
+ """Generate distinct colors using HSL color space for better distribution"""
428
+ hue = (index * 360 / total) % 360
429
+ saturation = 70 + (index % 3) * 10 # Vary saturation slightly
430
+ lightness = 45 + (index % 2) * 10 # Vary lightness slightly
431
+ return f'hsl({hue}, {saturation}%, {lightness}%)'
432
+
433
  agents = metrics['agents']
434
  months = metrics['months']
435
  data = metrics['data']
436
 
437
+ # Generate colors for all agents
438
  agent_colors = {agent: generate_color(idx, len(agents)) for idx, agent in enumerate(agents)}
439
 
440
  # Add traces for each agent
 
456
  name=agent_name,
457
  mode='lines+markers',
458
  line=dict(color=color, width=2),
459
+ marker=dict(size=8),
460
  legendgroup=agent_name,
461
+ showlegend=(top_n is not None and top_n <= 10), # Show legend for top N agents
462
+ hovertemplate='<b>Agent: %{fullData.name}</b><br>' +
463
+ 'Month: %{x}<br>' +
464
  'Acceptance Rate: %{y:.2f}%<br>' +
465
  '<extra></extra>'
466
  ),
 
484
  name=agent_name,
485
  marker=dict(color=color, opacity=0.6),
486
  legendgroup=agent_name,
487
+ showlegend=False, # Hide duplicate legend entry (already shown in Scatter)
488
+ hovertemplate='<b>Agent: %{fullData.name}</b><br>' +
489
+ 'Month: %{x}<br>' +
490
  'Total PRs: %{y}<br>' +
491
  '<extra></extra>',
492
  offsetgroup=agent_name # Group bars by agent for proper spacing
 
496
 
497
  # Update axes labels
498
  fig.update_xaxes(title_text=None)
499
+ fig.update_yaxes(
500
+ title_text="<b>Acceptance Rate (%)</b>",
501
+ range=[0, 100],
502
+ secondary_y=False,
503
+ showticklabels=True,
504
+ tickmode='linear',
505
+ dtick=10,
506
+ showgrid=True
507
+ )
508
  fig.update_yaxes(title_text="<b>Total PRs</b>", secondary_y=True)
509
 
510
  # Update layout
511
+ show_legend = (top_n is not None and top_n <= 10)
512
  fig.update_layout(
513
  title=None,
514
+ hovermode='closest', # Show individual agent info on hover
515
  barmode='group',
516
  height=600,
517
+ showlegend=show_legend,
518
+ margin=dict(l=50, r=150 if show_legend else 50, t=50, b=50) # More right margin when legend is shown
 
 
 
 
 
 
519
  )
520
 
521
  return fig
 
523
 
524
  def get_leaderboard_dataframe():
525
  """
526
+ Load leaderboard from saved dataset and convert to pandas DataFrame for display.
 
527
  Returns formatted DataFrame sorted by total PRs.
528
  """
529
+ # Load from saved dataset
530
+ saved_data = load_leaderboard_data_from_hf()
531
+
532
+ if not saved_data or 'leaderboard' not in saved_data:
533
+ print(f"No leaderboard data available")
534
+ # Return empty DataFrame with correct columns if no data
535
+ column_names = [col[0] for col in LEADERBOARD_COLUMNS]
536
+ return pd.DataFrame(columns=column_names)
537
 
538
+ cache_dict = saved_data['leaderboard']
539
+ last_updated = saved_data.get('metadata', {}).get('last_updated', 'Unknown')
540
+ print(f"Loaded leaderboard from saved dataset (last updated: {last_updated})")
541
+ print(f"Cache dict size: {len(cache_dict)}")
 
 
542
 
543
  if not cache_dict:
544
+ print("WARNING: cache_dict is empty!")
545
  # Return empty DataFrame with correct columns if no data
546
  column_names = [col[0] for col in LEADERBOARD_COLUMNS]
547
  return pd.DataFrame(columns=column_names)
548
 
549
  rows = []
550
+ filtered_count = 0
551
  for identifier, data in cache_dict.items():
552
+ total_prs = data.get('total_prs', 0)
553
+ print(f" Agent '{identifier}': {total_prs} PRs")
554
+
555
  # Filter out agents with zero total PRs
556
+ if total_prs == 0:
557
+ filtered_count += 1
558
+ continue
559
+
560
+ # Only include display-relevant fields
561
+ rows.append([
562
+ data.get('name', 'Unknown'),
563
+ data.get('website', 'N/A'),
564
+ total_prs,
565
+ data.get('merged_prs', 0),
566
+ data.get('acceptance_rate', 0.0),
567
+ ])
568
+
569
+ print(f"Filtered out {filtered_count} agents with 0 PRs")
570
+ print(f"Leaderboard will show {len(rows)} agents")
571
 
572
  # Create DataFrame
573
  column_names = [col[0] for col in LEADERBOARD_COLUMNS]
 
583
  if "Total PRs" in df.columns and not df.empty:
584
  df = df.sort_values(by="Total PRs", ascending=False).reset_index(drop=True)
585
 
586
+ print(f"Final DataFrame shape: {df.shape}")
587
+ print("="*60 + "\n")
588
+
589
  return df
590
 
591
 
592
+ def submit_agent(identifier, agent_name, organization, website):
593
  """
594
  Submit a new agent to the leaderboard.
595
  Validates input and saves submission.
 
596
  """
597
  # Validate required fields
598
  if not identifier or not identifier.strip():
599
+ return "ERROR: GitHub identifier is required", get_leaderboard_dataframe()
600
  if not agent_name or not agent_name.strip():
601
+ return "ERROR: Agent name is required", get_leaderboard_dataframe()
602
  if not organization or not organization.strip():
603
+ return "ERROR: Organization name is required", get_leaderboard_dataframe()
604
  if not website or not website.strip():
605
+ return "ERROR: Website URL is required", get_leaderboard_dataframe()
606
 
607
  # Clean inputs
608
  identifier = identifier.strip()
609
  agent_name = agent_name.strip()
610
  organization = organization.strip()
 
611
  website = website.strip()
612
 
613
  # Validate GitHub identifier
614
  is_valid, message = validate_github_username(identifier)
615
  if not is_valid:
616
+ return f"ERROR: {message}", get_leaderboard_dataframe()
617
 
618
  # Check for duplicates by loading agents from HuggingFace
619
  agents = load_agents_from_hf()
620
  if agents:
621
  existing_names = {agent['github_identifier'] for agent in agents}
622
  if identifier in existing_names:
623
+ return f"WARNING: Agent with identifier '{identifier}' already exists", get_leaderboard_dataframe()
624
 
625
  # Create submission
626
  submission = {
627
  'name': agent_name,
628
  'organization': organization,
629
  'github_identifier': identifier,
 
630
  'website': website,
631
+ 'status': 'public'
632
  }
633
 
634
  # Save to HuggingFace
635
  if not save_agent_to_hf(submission):
636
+ return "ERROR: Failed to save submission", get_leaderboard_dataframe()
637
+
638
+ # Return success message - data will be populated by backend updates
639
+ return f"SUCCESS: Successfully submitted {agent_name}! PR data will be populated by the backend system.", get_leaderboard_dataframe()
640
 
641
+
642
+ # =============================================================================
643
+ # DATA RELOAD FUNCTION
644
+ # =============================================================================
645
+
646
+ def reload_leaderboard_data():
647
+ """
648
+ Reload leaderboard data from HuggingFace.
649
+ This function is called by the scheduler on a daily basis.
650
+ """
651
+ print(f"\n{'='*80}")
652
+ print(f"Reloading leaderboard data from HuggingFace...")
653
+ print(f"{'='*80}\n")
654
+
655
+ try:
656
+ data = load_leaderboard_data_from_hf()
657
+ if data:
658
+ print(f"Successfully reloaded leaderboard data")
659
+ print(f" Last updated: {data.get('metadata', {}).get('last_updated', 'Unknown')}")
660
+ print(f" Agents: {len(data.get('leaderboard', {}))}")
661
+ else:
662
+ print(f"No data available")
663
+ except Exception as e:
664
+ print(f"Error reloading leaderboard data: {str(e)}")
665
+
666
+ print(f"{'='*80}\n")
667
 
668
 
669
  # =============================================================================
670
  # GRADIO APPLICATION
671
  # =============================================================================
672
 
673
+ print(f"\nStarting SWE Agent PR Leaderboard")
674
+ print(f" Data source: {LEADERBOARD_REPO}")
675
+ print(f" Reload frequency: Daily at 12:00 AM UTC\n")
676
 
677
+ # Start APScheduler for daily data reload at 12:00 AM UTC
678
  scheduler = BackgroundScheduler(timezone="UTC")
679
  scheduler.add_job(
680
+ reload_leaderboard_data,
681
+ trigger=CronTrigger(hour=0, minute=0), # 12:00 AM UTC daily
682
+ id='daily_data_reload',
683
+ name='Daily Data Reload',
684
  replace_existing=True
685
  )
686
  scheduler.start()
687
  print(f"\n{'='*80}")
688
+ print(f"Scheduler initialized successfully")
689
+ print(f"Reload schedule: Daily at 12:00 AM UTC")
690
+ print(f"On startup: Loads cached data from HuggingFace on demand")
691
  print(f"{'='*80}\n")
692
 
 
 
 
 
 
 
 
 
 
 
 
693
  # Create Gradio interface
694
  with gr.Blocks(title="SWE Agent PR Leaderboard", theme=gr.themes.Soft()) as app:
695
+ gr.Markdown("# SWE Agent PR Leaderboard")
 
 
696
  gr.Markdown(f"Track and compare GitHub pull request statistics for SWE agents")
697
 
698
  with gr.Tabs():
699
 
700
  # Leaderboard Tab
701
+ with gr.Tab("Leaderboard"):
702
+ gr.Markdown("*Statistics are based on agent PR activity tracked by the system*")
 
703
  leaderboard_table = Leaderboard(
704
+ value=pd.DataFrame(columns=[col[0] for col in LEADERBOARD_COLUMNS]), # Empty initially
705
  datatype=LEADERBOARD_COLUMNS,
706
  search_columns=["Agent Name", "Website"],
707
  filter_columns=[
 
716
  ]
717
  )
718
 
719
+ # Load leaderboard data when app starts
720
+ app.load(
721
+ fn=get_leaderboard_dataframe,
722
+ inputs=[],
723
+ outputs=[leaderboard_table]
724
+ )
725
+
726
+ # Monthly Metrics Section
727
+ gr.Markdown("---") # Divider
728
+ gr.Markdown("### Monthly Performance - Top 5 Agents")
729
+ gr.Markdown("*Shows acceptance rate trends and PR volumes for the most active agents*")
730
 
731
+ monthly_metrics_plot = gr.Plot(label="Monthly Metrics")
732
+
733
+ # Load monthly metrics when app starts
734
+ app.load(
735
+ fn=lambda: create_monthly_metrics_plot(),
736
+ inputs=[],
737
+ outputs=[monthly_metrics_plot]
738
  )
739
 
740
+
741
  # Submit Agent Tab
742
+ with gr.Tab("Submit Agent"):
743
 
744
  gr.Markdown("### Submit Your Agent")
745
  gr.Markdown("Fill in the details below to add your agent to the leaderboard.")
 
748
  with gr.Column():
749
  github_input = gr.Textbox(
750
  label="GitHub Identifier*",
751
+ placeholder="Your agent username (e.g., my-agent[bot])"
752
  )
753
  name_input = gr.Textbox(
754
  label="Agent Name*",
 
760
  label="Organization*",
761
  placeholder="Your organization or team name"
762
  )
 
 
 
 
 
763
  website_input = gr.Textbox(
764
  label="Website*",
765
  placeholder="https://your-agent-website.com"
 
777
  # Event handler
778
  submit_button.click(
779
  fn=submit_agent,
780
+ inputs=[github_input, name_input, organization_input, website_input],
781
+ outputs=[submission_status, leaderboard_table]
782
  )
783
 
784
 
docker-compose.yml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
1
+ services:
2
+ msr-miner:
3
+ build:
4
+ context: .
5
+ dockerfile: Dockerfile
6
+ container_name: gharchive-miner
7
+ restart: unless-stopped
8
+ env_file:
9
+ - .env
10
+ volumes:
11
+ # Mount entire workspace for live code updates
12
+ - .:/app
13
+ # Mount gharchive workspace for data storage
14
+ - ../gharchive:/gharchive:ro
15
+ environment:
16
+ - PYTHONUNBUFFERED=1
17
+ logging:
18
+ driver: "json-file"
19
+ options:
20
+ max-size: "10m"
21
+ max-file: "3"
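With this compose file in place, the miner is typically started with `docker compose up -d --build` and followed with `docker compose logs -f msr-miner`. The workspace bind mount keeps code changes live inside the container, while `../gharchive` is mounted read-only (`:ro`); since `msr.py` also downloads archives into `../gharchive/data`, that mount may need to be writable depending on how the data directory is populated.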
msr.py CHANGED
@@ -1,18 +1,25 @@
1
  """
2
  Minimalist PR Metadata Mining Script
3
- Mines PR metadata from GitHub Archive via BigQuery and saves to HuggingFace dataset.
4
  """
5
 
6
  import json
7
  import os
 
8
  import tempfile
9
  from datetime import datetime, timezone, timedelta
10
  from collections import defaultdict
 
11
  from huggingface_hub import HfApi, hf_hub_download
12
  from huggingface_hub.errors import HfHubHTTPError
13
  from dotenv import load_dotenv
14
- from google.cloud import bigquery
15
  import backoff
 
 
 
 
 
16
 
17
  # Load environment variables
18
  load_dotenv()
@@ -23,8 +30,27 @@ load_dotenv()
23
 
24
  AGENTS_REPO = "SWE-Arena/bot_metadata"
25
  PR_METADATA_REPO = "SWE-Arena/pr_metadata"
26
- LEADERBOARD_REPO = "SWE-Arena/leaderboard_metadata" # For storing computed leaderboard data
27
- LEADERBOARD_TIME_FRAME_DAYS = 180 # Time frame for mining new PRs
 
 
 
 
 
 
28
 
29
  # =============================================================================
30
  # UTILITY FUNCTIONS
@@ -54,246 +80,329 @@ def save_jsonl(filename, data):
54
  f.write(json.dumps(item) + '\n')
55
 
56
 
57
- def get_bigquery_client():
58
  """
59
- Initialize BigQuery client using credentials from environment variable.
60
-
61
- Expects GOOGLE_APPLICATION_CREDENTIALS_JSON environment variable containing
62
- the service account JSON credentials as a string.
 
63
  """
64
- # Get the JSON content from environment variable
65
- creds_json = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS_JSON')
66
 
67
- if creds_json:
68
- # Create a temporary file to store credentials
69
- with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as temp_file:
70
- temp_file.write(creds_json)
71
- temp_path = temp_file.name
72
 
73
- # Set environment variable to point to temp file
74
- os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = temp_path
75
 
76
- # Initialize BigQuery client
77
- client = bigquery.Client()
 
 
 
78
 
79
- # Clean up temp file
80
- os.unlink(temp_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
- return client
83
- else:
84
- raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON not found in environment")
85
 
 
 
 
86
 
87
- def generate_table_union_statements(start_date, end_date):
88
  """
89
- Generate UNION ALL statements for githubarchive.month tables in date range.
90
 
91
  Args:
92
- start_date: Start datetime
93
- end_date: End datetime
94
 
95
  Returns:
96
- String with UNION ALL SELECT statements for all tables in range
97
  """
98
- table_names = []
 
99
 
100
- # Start from the beginning of start_date's month
101
- current_date = start_date.replace(day=1)
102
- end_month = end_date.replace(day=1)
103
 
104
- while current_date <= end_month:
105
- table_name = f"`githubarchive.month.{current_date.strftime('%Y%m')}`"
106
- table_names.append(table_name)
 
 
 
 
 
107
 
108
- # Move to next month
109
- if current_date.month == 12:
110
- current_date = current_date.replace(year=current_date.year + 1, month=1)
111
- else:
112
- current_date = current_date.replace(month=current_date.month + 1)
 
 
 
 
 
 
113
 
114
- # Create UNION ALL chain
115
- union_parts = [f"SELECT * FROM {table}" for table in table_names]
116
- return " UNION ALL ".join(union_parts)
117
 
118
 
119
- def get_hf_token():
120
- """Get HuggingFace token from environment variables."""
121
- token = os.getenv('HF_TOKEN')
122
- if not token:
123
- print("Warning: HF_TOKEN not found in environment variables")
124
- return token
 
 
 
 
 
 
125
 
126
 
127
  # =============================================================================
128
- # HUGGINGFACE API RETRY WRAPPERS
129
  # =============================================================================
130
 
131
- def is_rate_limit_error(e):
132
- """Check if exception is a HuggingFace rate limit error (429)."""
 
 
 
133
  if isinstance(e, HfHubHTTPError):
134
- return e.response.status_code == 429
135
- return False
136
 
 
 
 
 
 
137
 
138
- def backoff_handler(details):
139
- """Handler to print retry attempt information."""
140
- wait_time = details['wait']
141
- tries = details['tries']
142
- wait_minutes = wait_time / 60
143
- print(f" ⏳ Rate limited. Retrying in {wait_minutes:.1f} minutes ({wait_time:.0f}s) - attempt {tries}/8...")
 
144
 
145
 
146
  @backoff.on_exception(
147
  backoff.expo,
148
- HfHubHTTPError,
149
- giveup=lambda e: not is_rate_limit_error(e),
150
  max_tries=8,
151
- base=300, # Start at 5 minutes (300 seconds)
152
- max_value=3600, # Cap at 60 minutes (3600 seconds)
153
- jitter=backoff.full_jitter,
154
- on_backoff=backoff_handler
 
 
155
  )
156
  def list_repo_files_with_backoff(api, **kwargs):
157
- """Wrapper for HfApi.list_repo_files with exponential backoff on rate limits."""
158
  return api.list_repo_files(**kwargs)
159
 
160
 
161
  @backoff.on_exception(
162
  backoff.expo,
163
- HfHubHTTPError,
164
- giveup=lambda e: not is_rate_limit_error(e),
165
  max_tries=8,
166
- base=300, # Start at 5 minutes (300 seconds)
167
- max_value=3600, # Cap at 60 minutes (3600 seconds)
168
- jitter=backoff.full_jitter,
169
- on_backoff=backoff_handler
 
 
170
  )
171
  def hf_hub_download_with_backoff(**kwargs):
172
- """Wrapper for hf_hub_download with exponential backoff on rate limits."""
173
  return hf_hub_download(**kwargs)
174
 
175
 
176
  @backoff.on_exception(
177
  backoff.expo,
178
- HfHubHTTPError,
179
- giveup=lambda e: not is_rate_limit_error(e),
180
  max_tries=8,
181
- base=300, # Start at 5 minutes (300 seconds)
182
- max_value=3600, # Cap at 60 minutes (3600 seconds)
183
- jitter=backoff.full_jitter,
184
- on_backoff=backoff_handler
 
 
185
  )
186
- def upload_folder_with_backoff(api, **kwargs):
187
- """Wrapper for HfApi.upload_folder with exponential backoff on rate limits."""
188
- return api.upload_folder(**kwargs)
189
 
190
 
191
  @backoff.on_exception(
192
  backoff.expo,
193
- HfHubHTTPError,
194
- giveup=lambda e: not is_rate_limit_error(e),
195
  max_tries=8,
196
- base=300, # Start at 5 minutes (300 seconds)
197
- max_value=3600, # Cap at 60 minutes (3600 seconds)
198
- jitter=backoff.full_jitter,
199
- on_backoff=backoff_handler
 
 
200
  )
201
- def upload_file_with_backoff(api, **kwargs):
202
- """Wrapper for HfApi.upload_file with exponential backoff on rate limits."""
203
- return api.upload_file(**kwargs)
204
-
205
 
206
- # =============================================================================
207
- # BIGQUERY FUNCTIONS
208
- # =============================================================================
209
 
210
- def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=100, upload_immediately=True):
211
  """
212
- Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
213
- Splits agents into smaller batches to avoid performance issues with large numbers of agents.
214
-
215
- Args:
216
- client: BigQuery client instance
217
- identifiers: List of GitHub usernames/bot identifiers
218
- start_date: Start datetime (timezone-aware)
219
- end_date: End datetime (timezone-aware)
220
- batch_size: Number of agents to process per batch (default: 100)
221
- upload_immediately: If True, upload each batch's results to HuggingFace immediately (default: True)
222
 
223
  Returns:
224
- Dictionary mapping agent identifier to list of issue metadata
225
  """
226
- # Split identifiers into batches
227
- batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
228
- total_batches = len(batches)
229
 
230
- print(f"\n🔍 Using BATCHED approach for {len(identifiers)} agents")
231
- print(f" Total batches: {total_batches} (batch size: {batch_size})")
232
- print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
233
- if upload_immediately:
234
- print(f" Upload mode: Immediate (after each batch)")
235
- else:
236
- print(f" Upload mode: Deferred (all at once)")
237
 
238
- # Collect results from all batches
239
- all_metadata = {}
240
 
241
- for batch_num, batch_identifiers in enumerate(batches, 1):
242
- print(f"\n📦 Processing batch {batch_num}/{total_batches} ({len(batch_identifiers)} agents)...")
243
 
244
- try:
245
- # Query each batch
246
- batch_results = fetch_all_pr_metadata_single_query(
247
- client, batch_identifiers, start_date, end_date
248
- )
249
 
250
- # Merge results
251
- for identifier, metadata_list in batch_results.items():
252
- if identifier in all_metadata:
253
- all_metadata[identifier].extend(metadata_list)
254
- else:
255
- all_metadata[identifier] = metadata_list
256
 
257
- print(f" ✓ Batch {batch_num}/{total_batches} complete")
 
 
 
 
258
 
259
- # Upload immediately after this batch if enabled
260
- if upload_immediately and batch_results:
261
- print(f"\n 📤 Uploading batch {batch_num}/{total_batches} results to HuggingFace...")
262
- upload_success = 0
263
- upload_errors = 0
264
 
265
- for identifier, metadata_list in batch_results.items():
266
- if metadata_list:
267
- if save_pr_metadata_to_hf(metadata_list, identifier):
268
- upload_success += 1
269
- else:
270
- upload_errors += 1
 
 
 
271
 
272
- print(f" ✓ Batch {batch_num}/{total_batches} upload complete ({upload_success} agents uploaded, {upload_errors} errors)")
 
 
273
 
274
- except Exception as e:
275
- print(f" ✗ Batch {batch_num}/{total_batches} failed: {str(e)}")
276
- print(f" Continuing with remaining batches...")
277
- continue
278
 
279
- total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
280
- print(f"\n✓ All batches complete! Found {total_prs} total PRs across {len(all_metadata)} agents")
 
281
 
282
- return all_metadata
283
 
284
 
285
- def fetch_all_pr_metadata_single_query(client, identifiers, start_date, end_date):
286
- """
287
- Fetch PR metadata for a BATCH of agents using ONE comprehensive BigQuery query.
288
 
289
- NOTE: This function is designed for smaller batches (~100 agents).
290
- For large numbers of agents, use fetch_issue_metadata_batched() instead.
 
291
 
292
  This query fetches:
293
  1. PRs authored by agents (user.login matches identifier)
 
294
 
295
  Args:
296
- client: BigQuery client instance
297
  identifiers: List of GitHub usernames/bot identifiers
298
  start_date: Start datetime (timezone-aware)
299
  end_date: End datetime (timezone-aware)
@@ -303,7 +412,7 @@ def fetch_all_pr_metadata_single_query(client, identifiers, start_date, end_date
303
  {
304
  'agent-identifier': [
305
  {
306
- 'url': PR URL,
307
  'created_at': Creation timestamp,
308
  'merged_at': Merge timestamp (if merged, else None),
309
  'closed_at': Close timestamp (if closed but not merged, else None)
@@ -313,35 +422,30 @@ def fetch_all_pr_metadata_single_query(client, identifiers, start_date, end_date
313
  ...
314
  }
315
  """
316
- print(f" Querying BigQuery for {len(identifiers)} agents in this batch...")
317
- print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
318
 
319
- # Generate table UNION statements for the time range
320
- table_union = generate_table_union_statements(start_date, end_date)
321
 
322
- # Build identifier list for SQL IN clause (author matching only)
323
- author_list = ', '.join([f"'{id}'" for id in identifiers])
324
-
325
- # Build comprehensive query with CTE
326
  query = f"""
327
  WITH pr_events AS (
328
  -- Get all PR events (opened, closed) for all agents
329
  SELECT
330
- JSON_EXTRACT_SCALAR(payload, '$.pull_request.html_url') as url,
331
- JSON_EXTRACT_SCALAR(payload, '$.pull_request.user.login') as pr_author,
332
- JSON_EXTRACT_SCALAR(payload, '$.pull_request.created_at') as created_at,
333
- CAST(JSON_EXTRACT_SCALAR(payload, '$.pull_request.merged') AS BOOL) as is_merged,
334
- JSON_EXTRACT_SCALAR(payload, '$.pull_request.merged_at') as merged_at,
335
- JSON_EXTRACT_SCALAR(payload, '$.pull_request.closed_at') as closed_at,
336
- JSON_EXTRACT_SCALAR(payload, '$.action') as action,
337
  created_at as event_time
338
- FROM (
339
- {table_union}
340
- ) t
341
  WHERE
342
- type = 'PullRequestEvent'
343
- AND JSON_EXTRACT_SCALAR(payload, '$.pull_request.html_url') IS NOT NULL
344
- AND JSON_EXTRACT_SCALAR(payload, '$.pull_request.user.login') IN ({author_list})
345
  ),
346
 
347
  pr_latest_state AS (
@@ -368,72 +472,77 @@ def fetch_all_pr_metadata_single_query(client, identifiers, start_date, end_date
368
  ORDER BY created_at DESC
369
  """
370
 
371
- print(f" Scanning {(end_date - start_date).days} days of GitHub Archive data...")
372
- print(f" Batch agents: {', '.join(identifiers[:5])}{'...' if len(identifiers) > 5 else ''}")
373
-
374
  try:
375
- query_job = client.query(query)
376
- results = list(query_job.result())
377
-
378
- print(f" ✓ Found {len(results)} PRs in this batch")
 
 
 
 
 
 
379
 
380
  # Group results by agent
381
  metadata_by_agent = defaultdict(list)
382
 
383
  for row in results:
384
- # Convert datetime objects to ISO strings
385
- created_at = row.created_at
386
- if hasattr(created_at, 'isoformat'):
387
- created_at = created_at.isoformat()
388
-
389
- merged_at = row.merged_at
390
- if hasattr(merged_at, 'isoformat'):
391
- merged_at = merged_at.isoformat()
392
-
393
- closed_at = row.closed_at
394
- if hasattr(closed_at, 'isoformat'):
395
- closed_at = closed_at.isoformat()
396
-
397
- pr_data = {
398
- 'html_url': row.url,
399
  'created_at': created_at,
400
  'merged_at': merged_at,
401
  'closed_at': closed_at,
402
- }
403
-
404
- # Assign to agent based on author
405
- pr_author = row.pr_author
406
- if pr_author and pr_author in identifiers:
407
- metadata_by_agent[pr_author].append(pr_data)
408
-
409
- # Print breakdown by agent (only show agents with PRs)
410
- print(f" 📊 Batch breakdown:")
411
- for identifier in identifiers:
412
- count = len(metadata_by_agent.get(identifier, []))
413
- if count > 0:
414
- metadata = metadata_by_agent[identifier]
415
- merged_count = sum(1 for m in metadata if m['merged_at'] is not None)
416
- closed_count = sum(1 for m in metadata if m['closed_at'] is not None and m['merged_at'] is None)
417
- open_count = count - merged_count - closed_count
418
- print(f" {identifier}: {count} PRs ({merged_count} merged, {closed_count} closed, {open_count} open)")
419
 
420
  # Convert defaultdict to regular dict
421
  return dict(metadata_by_agent)
422
 
423
  except Exception as e:
424
- print(f" BigQuery error: {str(e)}")
425
  import traceback
426
  traceback.print_exc()
427
  return {}
428
 
429
 
430
  # =============================================================================
431
- # HUGGINGFACE STORAGE FUNCTIONS
432
  # =============================================================================
433
 
434
  def group_metadata_by_date(metadata_list):
435
  """
436
- Group PR metadata by exact date (year.month.day) for efficient daily storage.
437
  Returns dict: {(year, month, day): [metadata_list]}
438
  """
439
  grouped = defaultdict(list)
@@ -453,20 +562,56 @@ def group_metadata_by_date(metadata_list):
453
  return dict(grouped)
454
 
455
 
456
- def save_pr_metadata_to_hf(metadata_list, agent_identifier):
457
  """
458
- Save PR metadata to HuggingFace dataset, organized by [agent_identifier]/YYYY.MM.DD.jsonl.
459
- Each file is stored in the agent's folder and named YYYY.MM.DD.jsonl for that day's PRs.
460
-
461
- This function OVERWRITES existing files completely with fresh data from BigQuery.
462
- Uses batch upload to avoid rate limit (uploads entire folder in single operation).
463
 
464
  Args:
465
- metadata_list: List of PR metadata dictionaries
466
- agent_identifier: GitHub identifier of the agent (used as folder name)
 
 
 
 
 
 
 
 
467
  """
468
- import shutil
 
 
 
 
 
 
469
 
 
 
 
470
  try:
471
  token = get_hf_token()
472
  if not token:
@@ -474,56 +619,89 @@ def save_pr_metadata_to_hf(metadata_list, agent_identifier):
474
 
475
  api = HfApi(token=token)
476
 
477
- # Group by date (year, month, day)
478
- grouped = group_metadata_by_date(metadata_list)
 
479
 
480
- if not grouped:
481
- print(f" No valid metadata to save for {agent_identifier}")
482
- return False
 
 
483
 
484
- # Create a temporary directory for batch upload
485
- temp_dir = tempfile.mkdtemp()
486
- agent_folder = os.path.join(temp_dir, agent_identifier)
487
- os.makedirs(agent_folder, exist_ok=True)
488
 
489
- try:
490
- print(f" 📦 Preparing batch upload for {len(grouped)} daily files...")
491
-
492
- # Process each daily file
493
- for (pr_year, month, day), day_metadata in grouped.items():
494
- filename = f"{agent_identifier}/{pr_year}.{month:02d}.{day:02d}.jsonl"
495
- local_filename = os.path.join(agent_folder, f"{pr_year}.{month:02d}.{day:02d}.jsonl")
496
-
497
- # Sort by created_at for better organization
498
- day_metadata.sort(key=lambda x: x.get('created_at', ''), reverse=True)
499
-
500
- # Save to temp directory (complete overwrite, no merging)
501
- save_jsonl(local_filename, day_metadata)
502
- print(f" Prepared {len(day_metadata)} PRs for {filename}")
503
-
504
- # Upload entire folder using upload_folder (single commit per agent)
505
- print(f" 📤 Uploading {len(grouped)} files ({len(metadata_list)} total PRs)...")
506
- upload_folder_with_backoff(
507
- api,
508
- folder_path=temp_dir,
509
- repo_id=PR_METADATA_REPO,
510
- repo_type="dataset",
511
- commit_message=f"Update PR metadata for {agent_identifier}"
512
- )
513
- print(f" ✓ Batch upload complete for {agent_identifier}")
514
 
515
- return True
 
 
516
 
517
- finally:
518
- # Always clean up temp directory
519
- if os.path.exists(temp_dir):
520
- shutil.rmtree(temp_dir)
 
 
 
 
 
 
521
 
522
  except Exception as e:
523
- print(f"Error saving PR metadata: {str(e)}")
524
  import traceback
525
  traceback.print_exc()
526
- return False
527
 
528
 
529
  def load_agents_from_hf():
@@ -537,13 +715,11 @@ def load_agents_from_hf():
537
  agents = []
538
 
539
  # List all files in the repository
540
- files = list_repo_files_with_backoff(api, repo_id=AGENTS_REPO, repo_type="dataset")
541
 
542
  # Filter for JSON files only
543
  json_files = [f for f in files if f.endswith('.json')]
544
 
545
- print(f"Found {len(json_files)} agent files in {AGENTS_REPO}")
546
-
547
  # Download and parse each JSON file
548
  for json_file in json_files:
549
  try:
@@ -567,10 +743,11 @@ def load_agents_from_hf():
567
  agents.append(agent_data)
568
 
569
  except Exception as e:
570
- print(f"Warning: Could not load {json_file}: {str(e)}")
571
  continue
572
 
573
- print(f" Loaded {len(agents)} agents from HuggingFace")
 
574
  return agents
575
 
576
  except Exception as e:
@@ -609,46 +786,54 @@ def calculate_pr_stats_from_metadata(metadata_list):
609
  }
610
 
611
 
612
- def calculate_monthly_metrics(all_metadata, agents):
613
  """
614
  Calculate monthly metrics for all agents for visualization.
615
 
616
  Args:
617
- all_metadata: List of all PR metadata with agent_identifier field
618
- agents: List of agent data dictionaries
619
 
620
  Returns:
621
- dict with monthly metrics organized by agent
 
 
 
 
 
 
 
 
 
 
622
  """
623
- from datetime import datetime, timezone
624
-
625
  # Create mapping from agent_identifier to agent_name
626
- identifier_to_name = {
627
- agent.get('github_identifier'): agent.get('name', 'Unknown')
628
- for agent in agents
629
- if agent.get('github_identifier')
630
- }
631
 
632
  # Group by agent and month
633
  agent_month_data = defaultdict(lambda: defaultdict(list))
634
 
635
- for pr_meta in all_metadata:
636
- agent_identifier = pr_meta.get('agent_identifier')
637
- created_at = pr_meta.get('created_at')
 
638
 
639
- if not agent_identifier or not created_at:
640
- continue
641
 
642
- # Get agent_name from identifier
643
- agent_name = identifier_to_name.get(agent_identifier, agent_identifier)
644
 
645
- try:
646
- dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
647
- month_key = f"{dt.year}-{dt.month:02d}"
648
- agent_month_data[agent_name][month_key].append(pr_meta)
649
- except Exception as e:
650
- print(f"Warning: Could not parse date '{created_at}': {e}")
651
- continue
652
 
653
  # Get all unique months and sort them
654
  all_months = set()
@@ -660,8 +845,8 @@ def calculate_monthly_metrics(all_metadata, agents):
660
  result_data = {}
661
  for agent_name, month_dict in agent_month_data.items():
662
  acceptance_rates = []
663
- total_prs = []
664
- merged_prs = []
665
  closed_not_merged_list = []
666
 
667
  for month in months:
@@ -682,14 +867,14 @@ def calculate_monthly_metrics(all_metadata, agents):
682
  acceptance_rate = (merged_count / total_decisions * 100) if total_decisions > 0 else None
683
 
684
  acceptance_rates.append(acceptance_rate)
685
- total_prs.append(total_count)
686
- merged_prs.append(merged_count)
687
  closed_not_merged_list.append(closed_not_merged_count)
688
 
689
  result_data[agent_name] = {
690
  'acceptance_rates': acceptance_rates,
691
- 'total_prs': total_prs,
692
- 'merged_prs': merged_prs,
693
  'closed_not_merged': closed_not_merged_list
694
  }
695
 
@@ -702,113 +887,36 @@ def calculate_monthly_metrics(all_metadata, agents):
702
  }
703
 
704
 
705
- def load_all_pr_metadata_from_hf(agents):
706
  """
707
- Load all PR metadata from HuggingFace dataset for all agents.
708
 
709
  Args:
710
- agents: List of agent dictionaries with github_identifier
 
711
 
712
  Returns:
713
- List of PR metadata with agent_identifier field added
714
- """
715
- try:
716
- api = HfApi()
717
- token = get_hf_token()
718
-
719
- # Calculate cutoff date
720
- cutoff_date = datetime.now(timezone.utc) - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
721
-
722
- # List all files in the repository
723
- files = list_repo_files_with_backoff(api, repo_id=PR_METADATA_REPO, repo_type="dataset")
724
-
725
- # Filter for files within the time frame
726
- relevant_files = []
727
- for f in files:
728
- if f.endswith('.jsonl'):
729
- parts = f.split('/')
730
- if len(parts) == 2:
731
- filename = parts[1]
732
- try:
733
- date_part = filename.replace('.jsonl', '')
734
- date_components = date_part.split('.')
735
- if len(date_components) == 3:
736
- file_year, file_month, file_day = map(int, date_components)
737
- file_date = datetime(file_year, file_month, file_day, tzinfo=timezone.utc)
738
-
739
- if file_date >= cutoff_date:
740
- relevant_files.append(f)
741
- except Exception:
742
- continue
743
-
744
- print(f"\n📥 Loading PR metadata from {len(relevant_files)} daily files...")
745
-
746
- all_metadata = []
747
- for filename in relevant_files:
748
- try:
749
- parts = filename.split('/')
750
- if len(parts) != 2:
751
- continue
752
-
753
- agent_identifier = parts[0]
754
-
755
- file_path = hf_hub_download_with_backoff(
756
- repo_id=PR_METADATA_REPO,
757
- filename=filename,
758
- repo_type="dataset",
759
- token=token
760
- )
761
- day_metadata = load_jsonl(file_path)
762
-
763
- # Add agent_identifier to each PR
764
- for pr_meta in day_metadata:
765
- created_at = pr_meta.get('created_at')
766
- if created_at:
767
- try:
768
- dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
769
- if dt >= cutoff_date:
770
- pr_meta['agent_identifier'] = agent_identifier
771
- all_metadata.append(pr_meta)
772
- except Exception:
773
- continue
774
-
775
- except Exception as e:
776
- print(f" Warning: Could not load {filename}: {str(e)}")
777
-
778
- print(f"✓ Loaded {len(all_metadata)} total PRs")
779
- return all_metadata
780
-
781
- except Exception as e:
782
- print(f"✗ Error loading PR metadata: {str(e)}")
783
- return []
784
-
785
-
786
- def construct_leaderboard_from_metadata(all_metadata, agents):
787
  """
788
- Construct leaderboard data from PR metadata.
789
-
790
- Args:
791
- all_metadata: List of PR metadata with agent_identifier field
792
- agents: List of agent dictionaries
793
 
794
- Returns:
795
- Dictionary mapping agent identifier to stats
796
- """
797
  cache_dict = {}
798
 
799
  for agent in agents:
800
  identifier = agent.get('github_identifier')
801
  agent_name = agent.get('name', 'Unknown')
802
 
803
- # Filter metadata for this agent
804
- bot_metadata = [pr for pr in all_metadata if pr.get('agent_identifier') == identifier]
805
 
806
  # Calculate stats
807
  stats = calculate_pr_stats_from_metadata(bot_metadata)
808
 
809
  cache_dict[identifier] = {
810
  'name': agent_name,
811
- 'website': agent.get('website', 'Unknown'),
812
  'github_identifier': identifier,
813
  **stats
814
  }
@@ -816,16 +924,16 @@ def construct_leaderboard_from_metadata(all_metadata, agents):
816
  return cache_dict
817
 
818
 
819
- def save_leaderboard_data_to_hf(leaderboard_data, monthly_metrics):
820
  """
821
- Save computed leaderboard and monthly metrics to HuggingFace dataset as swe-pr.json.
822
 
823
  Args:
824
- leaderboard_data: Dictionary with agent stats (from construct_leaderboard)
825
- monthly_metrics: Dictionary with monthly metrics (from calculate_monthly_metrics)
826
 
827
  Returns:
828
- True if successful, False otherwise
829
  """
830
  try:
831
  token = get_hf_token()
@@ -833,39 +941,39 @@ def save_leaderboard_data_to_hf(leaderboard_data, monthly_metrics):
833
  raise Exception("No HuggingFace token found")
834
 
835
  api = HfApi(token=token)
 
836
 
837
- # Combine data into single JSON structure
838
  combined_data = {
839
- 'leaderboard': leaderboard_data,
 
840
  'monthly_metrics': monthly_metrics,
841
- 'last_updated': datetime.now(timezone.utc).isoformat()
 
 
842
  }
843
 
844
- # Save to temp file
845
- temp_file = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json')
846
- try:
847
- json.dump(combined_data, temp_file, indent=2)
848
- temp_file.close()
849
 
850
- # Upload to HuggingFace
851
- print(f"\n📤 Uploading leaderboard data to {LEADERBOARD_REPO}/swe-pr.json...")
852
  upload_file_with_backoff(
853
- api,
854
- path_or_fileobj=temp_file.name,
855
- path_in_repo="swe-pr.json",
856
  repo_id=LEADERBOARD_REPO,
857
  repo_type="dataset"
858
  )
859
- print(f"✓ Leaderboard data uploaded successfully")
860
  return True
861
-
862
  finally:
863
- # Clean up temp file
864
- if os.path.exists(temp_file.name):
865
- os.unlink(temp_file.name)
866
 
867
  except Exception as e:
868
- print(f"Error saving leaderboard data: {str(e)}")
869
  import traceback
870
  traceback.print_exc()
871
  return False
@@ -878,31 +986,35 @@ def save_leaderboard_data_to_hf(leaderboard_data, monthly_metrics):
878
  def mine_all_agents():
879
  """
880
  Mine PR metadata for all agents within LEADERBOARD_TIME_FRAME_DAYS and save to HuggingFace.
881
- Uses ONE BigQuery query for ALL agents (most efficient approach).
882
  """
883
- # Load agent metadata from HuggingFace
 
 
 
 
 
 
 
 
884
  agents = load_agents_from_hf()
885
  if not agents:
886
- print("No agents found in HuggingFace dataset")
887
  return
888
 
889
  # Extract all identifiers
890
  identifiers = [agent['github_identifier'] for agent in agents if agent.get('github_identifier')]
891
  if not identifiers:
892
- print("No valid agent identifiers found")
893
  return
894
 
895
- print(f"\n{'='*80}")
896
- print(f"Starting PR metadata mining for {len(identifiers)} agents")
897
- print(f"Time frame: Last {LEADERBOARD_TIME_FRAME_DAYS} days")
898
- print(f"Data source: BigQuery + GitHub Archive (BATCHED QUERIES)")
899
- print(f"{'='*80}\n")
900
 
901
- # Initialize BigQuery client
902
  try:
903
- client = get_bigquery_client()
904
  except Exception as e:
905
- print(f"Failed to initialize BigQuery client: {str(e)}")
906
  return
907
 
908
  # Define time range: past LEADERBOARD_TIME_FRAME_DAYS (excluding today)
@@ -911,68 +1023,116 @@ def mine_all_agents():
911
  start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
912
 
913
  try:
914
- # Use batched approach for better performance
915
- # upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
916
- all_metadata = fetch_issue_metadata_batched(
917
- client, identifiers, start_date, end_date, batch_size=100, upload_immediately=True
918
  )
919
 
920
  # Calculate summary statistics
921
  total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
922
  agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
923
 
924
- print(f"\n{'='*80}")
925
- print(f"✅ BigQuery mining and upload complete!")
926
- print(f" Total agents: {len(agents)}")
927
- print(f" Agents with data: {agents_with_data}")
928
- print(f" Total PRs found: {total_prs}")
929
- print(f"{'='*80}\n")
930
 
931
  except Exception as e:
932
- print(f"Error during BigQuery fetch: {str(e)}")
933
  import traceback
934
  traceback.print_exc()
935
  return
 
 
 
 
 
 
 
 
936
 
937
- # Compute and save leaderboard data
938
- print(f"\n{'='*80}")
939
- print(f"📊 Computing leaderboard and monthly metrics...")
940
- print(f"{'='*80}\n")
941
 
942
  try:
943
- # Load all PR metadata from HuggingFace
944
- all_pr_metadata = load_all_pr_metadata_from_hf(agents)
945
-
946
- if all_pr_metadata:
947
- # Construct leaderboard
948
- leaderboard_data = construct_leaderboard_from_metadata(all_pr_metadata, agents)
949
- print(f"✓ Computed leaderboard for {len(leaderboard_data)} agents")
950
-
951
- # Calculate monthly metrics
952
- monthly_metrics = calculate_monthly_metrics(all_pr_metadata, agents)
953
- print(f"✓ Computed monthly metrics for {len(monthly_metrics['agents'])} agents across {len(monthly_metrics['months'])} months")
954
-
955
- # Save to HuggingFace
956
- if save_leaderboard_data_to_hf(leaderboard_data, monthly_metrics):
957
- print(f"\n{'='*80}")
958
- print(f"✅ Leaderboard data saved successfully!")
959
- print(f"{'='*80}\n")
960
- else:
961
- print(f"\n{'='*80}")
962
- print(f"⚠️ Warning: Failed to save leaderboard data")
963
- print(f"{'='*80}\n")
964
- else:
965
- print(f"⚠️ No PR metadata found to compute leaderboard")
966
 
967
  except Exception as e:
968
- print(f"Error computing/saving leaderboard data: {str(e)}")
969
  import traceback
970
  traceback.print_exc()
971
 
972
 
 
 
 
 
 
 
973
  # =============================================================================
974
  # ENTRY POINT
975
  # =============================================================================
976
 
977
  if __name__ == "__main__":
978
- mine_all_agents()
 
 
 
 
 
 
1
  """
2
  Minimalist PR Metadata Mining Script
3
+ Mines PR metadata from locally downloaded GHArchive data via DuckDB and saves to HuggingFace dataset.
4
  """
5
 
6
  import json
7
  import os
8
+ import time
9
  import tempfile
10
  from datetime import datetime, timezone, timedelta
11
  from collections import defaultdict
12
+ from concurrent.futures import ThreadPoolExecutor, as_completed
13
  from huggingface_hub import HfApi, hf_hub_download
14
  from huggingface_hub.errors import HfHubHTTPError
15
  from dotenv import load_dotenv
16
+ import duckdb
17
  import backoff
18
+ import requests
19
+ import requests.exceptions
20
+ from apscheduler.schedulers.blocking import BlockingScheduler
21
+ from apscheduler.triggers.cron import CronTrigger
22
+ import logging
23
 
24
  # Load environment variables
25
  load_dotenv()
 
30
 
31
  AGENTS_REPO = "SWE-Arena/bot_metadata"
32
  PR_METADATA_REPO = "SWE-Arena/pr_metadata"
33
+ LEADERBOARD_REPO = "SWE-Arena/leaderboard_metadata" # HuggingFace dataset for leaderboard data
34
+ LEADERBOARD_TIME_FRAME_DAYS = 180 # Time frame for leaderboard
35
+ GHARCHIVE_DATA_DIR = "../gharchive/data" # Local GHArchive data directory
36
+ DUCKDB_CACHE_FILE = "../gharchive/gharchive_cache.duckdb" # Persistent DuckDB database for caching
37
+
38
+ # Download configuration
39
+ DOWNLOAD_WORKERS = 48 # Number of parallel download threads
40
+ DOWNLOAD_RETRY_DELAY = 2 # Initial retry delay in seconds
41
+ MAX_RETRIES = 5 # Maximum number of retries for each API call
42
+
43
+ # Upload configuration
44
+ UPLOAD_DELAY_SECONDS = 5 # Delay between individual file uploads to avoid rate limits
45
+ UPLOAD_INITIAL_BACKOFF = 60 # Initial backoff time in seconds (1 minute)
46
+ UPLOAD_MAX_BACKOFF = 3600 # Maximum backoff time in seconds (60 minutes)
47
+
48
+ # Scheduler configuration
49
+ SCHEDULE_ENABLED = False # Enable/disable scheduler
50
+ SCHEDULE_DAY_OF_MONTH = 8 # Day of month (1-31) - the 8th falls in the second week
51
+ SCHEDULE_HOUR = 0 # Hour (0-23) - 12am midnight
52
+ SCHEDULE_MINUTE = 0 # Minute (0-59)
53
+ SCHEDULE_TIMEZONE = 'UTC' # Timezone for scheduling
54
 
55
  # =============================================================================
56
  # UTILITY FUNCTIONS
 
80
  f.write(json.dumps(item) + '\n')
81
 
82
 
83
+ def normalize_date_format(date_string):
84
  """
85
+ Convert date strings to standardized ISO 8601 format with Z suffix.
86
+ Handles both 'T' and space-separated datetime formats (including newlines).
87
+ Examples:
88
+ - 2025-10-15T23:23:47.983068 -> 2025-10-15T23:23:47Z
89
+ - 2025-06-17 21:21:07+00 -> 2025-06-17T21:21:07Z
90
  """
91
+ if not date_string or date_string == 'N/A':
92
+ return 'N/A'
93
 
94
+ try:
95
+ import re
96
+ # Remove all whitespace (spaces, newlines, tabs) and replace with single space
97
+ date_string = re.sub(r'\s+', ' ', date_string.strip())
 
98
 
99
+ # Replace space with 'T' for ISO format compatibility
100
+ date_string = date_string.replace(' ', 'T')
101
 
102
+ # Fix incomplete timezone offset (+00 or -00 -> +00:00 or -00:00)
103
+ # Check if timezone offset exists and is incomplete
104
+ if len(date_string) >= 3:
105
+ if date_string[-3:-2] in ('+', '-') and ':' not in date_string[-3:]:
106
+ date_string = date_string + ':00'
107
 
108
+ # Parse the date string (handles both with and without microseconds)
109
+ dt = datetime.fromisoformat(date_string.replace('Z', '+00:00'))
110
+
111
+ # Convert to standardized format
112
+ return dt.strftime('%Y-%m-%dT%H:%M:%SZ')
113
+ except Exception as e:
114
+ print(f"Warning: Could not parse date '{date_string}': {e}")
115
+ return date_string
116
+
117
+
118
+ def get_hf_token():
119
+ """Get HuggingFace token from environment variables."""
120
+ token = os.getenv('HF_TOKEN')
121
+ if not token:
122
+ print("Warning: HF_TOKEN not found in environment variables")
123
+ return token
124
 
 
 
 
125
 
126
+ # =============================================================================
127
+ # GHARCHIVE DOWNLOAD FUNCTIONS
128
+ # =============================================================================
129
 
130
+ def download_file(url):
131
  """
132
+ Download a GHArchive file with retry logic.
133
 
134
  Args:
135
+ url: URL to download
 
136
 
137
  Returns:
138
+ bool: True if successful, False otherwise
139
  """
140
+ filename = url.split("/")[-1]
141
+ filepath = os.path.join(GHARCHIVE_DATA_DIR, filename)
142
 
143
+ # Skip if json.gz already exists
144
+ if os.path.exists(filepath):
145
+ return True
146
 
147
+ # Download with retry logic
148
+ for attempt in range(MAX_RETRIES):
149
+ try:
150
+ response = requests.get(url, timeout=30)
151
+ response.raise_for_status()
152
+ with open(filepath, "wb") as f:
153
+ f.write(response.content)
154
+ return True
155
 
156
+ except requests.exceptions.HTTPError as e:
157
+ if e.response.status_code == 404:
158
+ # File doesn't exist, don't retry
159
+ return False
160
+ else:
161
+ # Other HTTP errors, retry
162
+ if attempt < MAX_RETRIES - 1:
163
+ wait_time = DOWNLOAD_RETRY_DELAY * (2 ** attempt) # Exponential backoff
164
+ print(f" ⚠ {filename}: HTTP error {e.response.status_code}, retrying in {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES})")
165
+ time.sleep(wait_time)
166
+ else:
167
+ print(f" ✗ {filename}: Failed after {MAX_RETRIES} attempts - {e}")
168
+
169
+ except (requests.exceptions.Timeout,
170
+ requests.exceptions.ConnectionError,
171
+ requests.exceptions.ReadTimeout) as e:
172
+ # Timeout/connection errors, retry
173
+ if attempt < MAX_RETRIES - 1:
174
+ wait_time = DOWNLOAD_RETRY_DELAY * (2 ** attempt) # Exponential backoff
175
+ print(f" ⚠ {filename}: {type(e).__name__}, retrying in {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES})")
176
+ time.sleep(wait_time)
177
+ else:
178
+ print(f" ✗ {filename}: Failed after {MAX_RETRIES} attempts - {type(e).__name__}")
179
+
180
+ except Exception as e:
181
+ # Other errors, retry
182
+ if attempt < MAX_RETRIES - 1:
183
+ wait_time = DOWNLOAD_RETRY_DELAY * (2 ** attempt)
184
+ print(f" ⚠ {filename}: {e}, retrying in {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES})")
185
+ time.sleep(wait_time)
186
+ else:
187
+ print(f" ✗ {filename}: Failed after {MAX_RETRIES} attempts - {e}")
188
 
189
+ return False
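A minimal smoke test for this helper, assuming the constants above are loaded (the archive date below is illustrative, not taken from this commit):

    os.makedirs(GHARCHIVE_DATA_DIR, exist_ok=True)
    url = "https://data.gharchive.org/2025-01-15-0.json.gz"  # illustrative hour
    if download_file(url):
        path = os.path.join(GHARCHIVE_DATA_DIR, url.split("/")[-1])
        print(f"Fetched {path} ({os.path.getsize(path)} bytes)")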
 
 
190
 
191
 
192
+ def download_all_gharchive_data():
193
+ """
194
+ Download all GHArchive data files for the last LEADERBOARD_TIME_FRAME_DAYS.
195
+ Uses parallel downloads with ThreadPoolExecutor.
196
+
197
+ Returns:
198
+ bool: True if the download pass finished (individual files may still have failed), False on a critical error
199
+ """
200
+ # Create data directory if it doesn't exist
201
+ os.makedirs(GHARCHIVE_DATA_DIR, exist_ok=True)
202
+
203
+ # Generate URLs for last N days (hourly files: 0-23 for each day)
204
+ end_date = datetime.now()
205
+ start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
206
+
207
+ urls = []
208
+ current_date = start_date
209
+ while current_date <= end_date:
210
+ date_str = current_date.strftime("%Y-%m-%d")
211
+ # Generate hourly URLs for this day (0-23)
212
+ for hour in range(24):
213
+ url = f"https://data.gharchive.org/{date_str}-{hour}.json.gz"
214
+ urls.append(url)
215
+ current_date += timedelta(days=1)
216
+
217
+ downloads_processed = 0
218
+
219
+ try:
220
+ with ThreadPoolExecutor(max_workers=DOWNLOAD_WORKERS) as executor:
221
+ # Submit all downloads
222
+ futures = [executor.submit(download_file, url) for url in urls]
223
+
224
+ # Wait for downloads to complete
225
+ for future in as_completed(futures):
226
+ downloads_processed += 1
227
+
228
+ print(f"Download complete: {downloads_processed} files")
229
+ return True
230
+
231
+ except Exception as e:
232
+ print(f"Error during download: {str(e)}")
233
+ import traceback
234
+ traceback.print_exc()
235
+ return False
236
 
237
 
238
  # =============================================================================
239
+ # HUGGINGFACE API WRAPPERS WITH ENHANCED BACKOFF
240
  # =============================================================================
241
 
242
+ def is_retryable_error(e):
243
+ """
244
+ Check if exception is retryable (rate limit or timeout error).
245
+ """
246
+ # Check for rate limit error (429)
247
  if isinstance(e, HfHubHTTPError):
248
+ if e.response.status_code == 429:
249
+ return True
250
 
251
+ # Check for timeout errors
252
+ if isinstance(e, (requests.exceptions.Timeout,
253
+ requests.exceptions.ReadTimeout,
254
+ requests.exceptions.ConnectTimeout)):
255
+ return True
256
 
257
+ # Check if it's a timeout error wrapped in HfHubHTTPError
258
+ if isinstance(e, Exception):
259
+ error_str = str(e).lower()
260
+ if 'timeout' in error_str or 'timed out' in error_str:
261
+ return True
262
+
263
+ return False
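The same retry policy can be reused for any other Hub call; a sketch with a hypothetical wrapper (delete_file_with_backoff is not part of this module):

    @backoff.on_exception(
        backoff.expo,
        (HfHubHTTPError, requests.exceptions.Timeout, requests.exceptions.RequestException, Exception),
        max_tries=8,
        base=300,
        max_value=3600,
        giveup=lambda e: not is_retryable_error(e),
    )
    def delete_file_with_backoff(api, **kwargs):
        """Hypothetical wrapper: retries api.delete_file() on rate limits and timeouts."""
        return api.delete_file(**kwargs)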
264
 
265
 
266
  @backoff.on_exception(
267
  backoff.expo,
268
+ (HfHubHTTPError, requests.exceptions.Timeout, requests.exceptions.RequestException, Exception),
 
269
  max_tries=8,
270
+ base=300,
271
+ max_value=3600,
272
+ giveup=lambda e: not is_retryable_error(e),
273
+ on_backoff=lambda details: print(
274
+ f" {details['exception']} error. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/8..."
275
+ )
276
  )
277
  def list_repo_files_with_backoff(api, **kwargs):
278
+ """Wrapper for api.list_repo_files() with exponential backoff for retryable errors."""
279
  return api.list_repo_files(**kwargs)
280
 
281
 
282
  @backoff.on_exception(
283
  backoff.expo,
284
+ (HfHubHTTPError, requests.exceptions.Timeout, requests.exceptions.RequestException, Exception),
 
285
  max_tries=8,
286
+ base=300,
287
+ max_value=3600,
288
+ giveup=lambda e: not is_retryable_error(e),
289
+ on_backoff=lambda details: print(
290
+ f" {details['exception']} error. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/8..."
291
+ )
292
  )
293
  def hf_hub_download_with_backoff(**kwargs):
294
+ """Wrapper for hf_hub_download() with exponential backoff for retryable errors."""
295
  return hf_hub_download(**kwargs)
296
 
297
 
298
  @backoff.on_exception(
299
  backoff.expo,
300
+ (HfHubHTTPError, requests.exceptions.Timeout, requests.exceptions.RequestException, Exception),
 
301
  max_tries=8,
302
+ base=300,
303
+ max_value=3600,
304
+ giveup=lambda e: not is_retryable_error(e),
305
+ on_backoff=lambda details: print(
306
+ f" {details['exception']} error. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/8..."
307
+ )
308
  )
309
+ def upload_file_with_backoff(api, **kwargs):
310
+ """Wrapper for api.upload_file() with exponential backoff for retryable errors."""
311
+ return api.upload_file(**kwargs)
312
 
313
 
314
  @backoff.on_exception(
315
  backoff.expo,
316
+ (HfHubHTTPError, requests.exceptions.Timeout, requests.exceptions.RequestException, Exception),
 
317
  max_tries=8,
318
+ base=300,
319
+ max_value=3600,
320
+ giveup=lambda e: not is_retryable_error(e),
321
+ on_backoff=lambda details: print(
322
+ f" {details['exception']} error. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/8..."
323
+ )
324
  )
325
+ def upload_folder_with_backoff(api, **kwargs):
326
+ """Wrapper for api.upload_folder() with exponential backoff for retryable errors."""
327
+ return api.upload_folder(**kwargs)
 
328
 
 
 
 
329
 
330
+ def get_duckdb_connection():
331
  """
332
+ Initialize DuckDB connection with persistent database and optimized parallelization.
 
 
 
 
 
 
 
 
 
333
 
334
  Returns:
335
+ DuckDB connection object
336
  """
337
+ # Use persistent database for caching results
338
+ conn = duckdb.connect(DUCKDB_CACHE_FILE)
 
339
 
340
+ # Tune DuckDB for parallel scans and caching
341
+ conn.execute("SET threads TO 8;") # Use all available cores
342
+ conn.execute("SET preserve_insertion_order = false;") # Better parallelization
343
+ conn.execute("SET enable_object_cache = true;") # Cache objects for reuse
344
+ conn.execute("SET temp_directory = '/tmp/duckdb_temp';") # Use fast temp storage if needed
 
 
345
 
346
+ return conn
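A quick sanity check, assuming at least one hourly file is already on disk (the filename below is illustrative), confirms DuckDB can scan the gzipped NDJSON directly:

    con = get_duckdb_connection()
    sample = os.path.join(GHARCHIVE_DATA_DIR, "2025-01-15-0.json.gz")  # illustrative file
    count = con.execute(f"""
        SELECT count(*)
        FROM read_json('{sample}', format='newline_delimited', compression='gzip', ignore_errors=true)
        WHERE TRY_CAST(type AS VARCHAR) = 'PullRequestEvent'
    """).fetchone()[0]
    print(f"PullRequestEvent rows in that hour: {count}")
    con.close()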
 
347
 
 
 
348
 
349
+ def generate_file_path_patterns(start_date, end_date, data_dir=GHARCHIVE_DATA_DIR):
350
+ """
351
+ Generate file path patterns for GHArchive data in date range.
352
+ Only includes files that actually exist on disk.
 
353
 
354
+ Args:
355
+ start_date: Start datetime
356
+ end_date: End datetime
357
+ data_dir: Directory containing GHArchive data files
 
 
358
 
359
+ Returns:
360
+ List of file path patterns (hourly JSON.gz files) that exist
361
+ """
362
+ file_patterns = []
363
+ missing_dates = set()
364
 
365
+ current_date = start_date.replace(hour=0, minute=0, second=0, microsecond=0)
366
+ end_day = end_date.replace(hour=0, minute=0, second=0, microsecond=0)
 
 
 
367
 
368
+ while current_date <= end_day:
369
+ # Pattern for hourly JSON.gz files: 2024-11-15-{0..23}.json.gz
370
+ date_has_files = False
371
+ for hour in range(24):
372
+ pattern = os.path.join(data_dir, f"{current_date.strftime('%Y-%m-%d')}-{hour}.json.gz")
373
+ # Only add pattern if file exists
374
+ if os.path.exists(pattern):
375
+ file_patterns.append(pattern)
376
+ date_has_files = True
377
 
378
+ # Track missing dates
379
+ if not date_has_files:
380
+ missing_dates.add(current_date.strftime('%Y-%m-%d'))
381
 
382
+ # Move to next day
383
+ current_date += timedelta(days=1)
 
 
384
 
385
+ # Print warning about missing dates
386
+ if missing_dates:
387
+ print(f" Warning: Skipping {len(missing_dates)} date(s) with no data files: {', '.join(sorted(missing_dates))}")
388
 
389
+ return file_patterns
390
 
391
 
392
+ # =============================================================================
393
+ # DUCKDB QUERY FUNCTIONS
394
+ # =============================================================================
395
 
396
+ def fetch_all_pr_metadata_single_query(conn, identifiers, start_date, end_date):
397
+ """
398
+ Fetch PR metadata for ALL agents using ONE comprehensive DuckDB query.
399
 
400
  This query fetches:
401
  1. PRs authored by agents (user.login matches identifier)
402
+ 2. PR status (opened, merged, closed)
403
 
404
  Args:
405
+ conn: DuckDB connection instance
406
  identifiers: List of GitHub usernames/bot identifiers
407
  start_date: Start datetime (timezone-aware)
408
  end_date: End datetime (timezone-aware)
 
412
  {
413
  'agent-identifier': [
414
  {
415
+ 'html_url': PR URL,
416
  'created_at': Creation timestamp,
417
  'merged_at': Merge timestamp (if merged, else None),
418
  'closed_at': Close timestamp (if closed but not merged, else None)
 
422
  ...
423
  }
424
  """
425
+ # Generate file path patterns for the time range
426
+ file_patterns = generate_file_path_patterns(start_date, end_date)
427
 
428
+ # Build identifier list for IN clause
429
+ identifier_list = ', '.join([f"'{id}'" for id in identifiers])
430
 
431
+ # Build comprehensive query with CTEs using parameterized file lists (JSON.gz format)
 
 
 
432
  query = f"""
433
  WITH pr_events AS (
434
  -- Get all PR events (opened, closed) for all agents
435
  SELECT
436
+ TRY_CAST(json_extract_string(payload, '$.pull_request.html_url') AS VARCHAR) as url,
437
+ TRY_CAST(json_extract_string(payload, '$.pull_request.user.login') AS VARCHAR) as pr_author,
438
+ TRY_CAST(json_extract_string(payload, '$.pull_request.created_at') AS VARCHAR) as created_at,
439
+ TRY_CAST(json_extract_string(payload, '$.pull_request.merged') AS BOOLEAN) as is_merged,
440
+ TRY_CAST(json_extract_string(payload, '$.pull_request.merged_at') AS VARCHAR) as merged_at,
441
+ TRY_CAST(json_extract_string(payload, '$.pull_request.closed_at') AS VARCHAR) as closed_at,
442
+ TRY_CAST(json_extract_string(payload, '$.action') AS VARCHAR) as action,
443
  created_at as event_time
444
+ FROM read_json($file_patterns, union_by_name=true, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true, maximum_object_size=2147483648)
 
 
445
  WHERE
446
+ TRY_CAST(type AS VARCHAR) = 'PullRequestEvent'
447
+ AND json_extract_string(payload, '$.pull_request.html_url') IS NOT NULL
448
+ AND TRY_CAST(json_extract_string(payload, '$.pull_request.user.login') AS VARCHAR) IN ({identifier_list})
449
  ),
450
 
451
  pr_latest_state AS (
 
472
  ORDER BY created_at DESC
473
  """
474
 
 
 
 
475
  try:
476
+ # Create cache table name based on date range
477
+ cache_table_name = f"pr_cache_{start_date.strftime('%Y%m%d')}_{end_date.strftime('%Y%m%d')}"
478
+
479
+ # Check if cache exists and is valid
480
+ cache_exists = conn.execute(f"""
481
+ SELECT COUNT(*) FROM information_schema.tables
482
+ WHERE table_name = '{cache_table_name}'
483
+ """).fetchone()[0] > 0
484
+
485
+ if cache_exists:
486
+ results = conn.execute(f"""
487
+ SELECT url, pr_author, created_at, merged_at, closed_at
488
+ FROM {cache_table_name}
489
+ WHERE pr_author IN ({identifier_list})
490
+ """).fetchall()
491
+ else:
492
+ # Execute query with parameters
493
+ results = conn.execute(query, {'file_patterns': file_patterns}).fetchall()
494
+
495
+ # Cache the complete results for all future queries in this date range
496
+ if len(results) > 0:
497
+ conn.execute(f"""
498
+ CREATE TABLE {cache_table_name} AS
499
+ SELECT * FROM (
500
+ SELECT UNNEST($1) as url, UNNEST($2) as pr_author,
501
+ UNNEST($3) as created_at, UNNEST($4) as merged_at,
502
+ UNNEST($5) as closed_at
503
+ )
504
+ """, [
505
+ [r[0] for r in results],
506
+ [r[1] for r in results],
507
+ [r[2] for r in results],
508
+ [r[3] for r in results],
509
+ [r[4] for r in results]
510
+ ])
511
 
512
  # Group results by agent
513
  metadata_by_agent = defaultdict(list)
514
 
515
  for row in results:
516
+ url = row[0]
517
+ pr_author = row[1]
518
+ created_at = normalize_date_format(row[2]) if row[2] else None
519
+ merged_at = normalize_date_format(row[3]) if row[3] else None
520
+ closed_at = normalize_date_format(row[4]) if row[4] else None
521
+
522
+ metadata_by_agent[pr_author].append({
523
+ 'html_url': url,
 
 
 
 
 
 
 
524
  'created_at': created_at,
525
  'merged_at': merged_at,
526
  'closed_at': closed_at,
527
+ })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
528
 
529
  # Convert defaultdict to regular dict
530
  return dict(metadata_by_agent)
531
 
532
  except Exception as e:
533
+ print(f"DuckDB error: {str(e)}")
534
  import traceback
535
  traceback.print_exc()
536
  return {}
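The returned mapping can be summarized per agent before uploading; a sketch reusing the conn/start_date/end_date set up in mine_all_agents(), with a hypothetical bot login:

    all_metadata = fetch_all_pr_metadata_single_query(conn, ["example-bot[bot]"], start_date, end_date)
    for agent, prs in all_metadata.items():
        merged = sum(1 for p in prs if p.get("merged_at"))
        closed = sum(1 for p in prs if p.get("closed_at") and not p.get("merged_at"))
        print(f"{agent}: {len(prs)} PRs, {merged} merged, {closed} closed without merge")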
537
 
538
 
539
  # =============================================================================
540
+ # HUGGINGFACE STORAGE FUNCTIONS WITH BATCH UPLOAD
541
  # =============================================================================
542
 
543
  def group_metadata_by_date(metadata_list):
544
  """
545
+ Group PR metadata by date (year.month.day) for daily storage.
546
  Returns dict: {(year, month, day): [metadata_list]}
547
  """
548
  grouped = defaultdict(list)
 
562
  return dict(grouped)
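For reference, a sketch of how these daily buckets map onto the file names used by the uploader below, assuming the bucket key is derived from created_at (the sample PR is illustrative):

    sample = [{"html_url": "https://github.com/org/repo/pull/1",
               "created_at": "2025-01-15T12:00:00Z", "merged_at": None, "closed_at": None}]
    for (year, month, day), items in group_metadata_by_date(sample).items():
        print(f"{year}.{month:02d}.{day:02d}.jsonl -> {len(items)} PR(s)")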
563
 
564
 
565
+ def upload_single_file_with_retry(api, local_path, repo_path, repo_id, repo_type, commit_message, max_retries=MAX_RETRIES):
566
  """
567
+ Upload a single file with exponential backoff retry logic.
 
 
 
 
568
 
569
  Args:
570
+ api: HfApi instance
571
+ local_path: Local file path
572
+ repo_path: Path in repository
573
+ repo_id: Repository ID
574
+ repo_type: Repository type (e.g., "dataset")
575
+ commit_message: Commit message
576
+ max_retries: Maximum number of retries
577
+
578
+ Returns:
579
+ bool: True if successful, False otherwise
580
  """
581
+ for attempt in range(max_retries):
582
+ try:
583
+ upload_file_with_backoff(
584
+ api=api,
585
+ path_or_fileobj=local_path,
586
+ path_in_repo=repo_path,
587
+ repo_id=repo_id,
588
+ repo_type=repo_type,
589
+ commit_message=commit_message
590
+ )
591
+ return True
592
+ except Exception as e:
593
+ if attempt < max_retries - 1:
594
+ # Calculate exponential backoff
595
+ wait_time = min(UPLOAD_INITIAL_BACKOFF * (2 ** attempt), UPLOAD_MAX_BACKOFF)
596
+ print(f" {e} error on attempt {attempt + 1}/{max_retries}. Retrying in {wait_time}s...")
597
+ time.sleep(wait_time)
598
+ else:
599
+ print(f" Failed after {max_retries} attempts: {str(e)}")
600
+ return False
601
+ return False
602
+
603
+
604
+ def batch_upload_pr_metadata(all_metadata):
605
+ """
606
+ Upload PR metadata for all agents with time gaps between uploads.
607
+ Each agent's data is uploaded as separate daily files with retry logic.
608
+
609
+ Args:
610
+ all_metadata: Dictionary mapping agent identifier to list of PR metadata
611
 
612
+ Returns:
613
+ tuple: (success_count, error_count)
614
+ """
615
  try:
616
  token = get_hf_token()
617
  if not token:
 
619
 
620
  api = HfApi(token=token)
621
 
622
+ success_count = 0
623
+ error_count = 0
624
+ total_files = 0
625
 
626
+ # First, calculate total number of files to upload
627
+ for agent_identifier, metadata_list in all_metadata.items():
628
+ if metadata_list:
629
+ grouped = group_metadata_by_date(metadata_list)
630
+ total_files += len(grouped)
631
 
632
+ print(f"Uploading {total_files} files for {len(all_metadata)} agents...")
 
 
 
633
 
634
+ file_count = 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
635
 
636
+ for agent_idx, (agent_identifier, metadata_list) in enumerate(all_metadata.items(), 1):
637
+ if not metadata_list:
638
+ continue
639
 
640
+ # Group by date
641
+ grouped = group_metadata_by_date(metadata_list)
642
+
643
+ # Create temporary files for this agent
644
+ agent_temp_dir = tempfile.mkdtemp()
645
+
646
+ try:
647
+ # Prepare all files locally
648
+ local_files = []
649
+ for (pr_year, month, day), day_metadata in grouped.items():
650
+ filename = f"{pr_year}.{month:02d}.{day:02d}.jsonl"
651
+ local_path = os.path.join(agent_temp_dir, filename)
652
+ repo_path = f"{agent_identifier}/{filename}"
653
+
654
+ # Sort by created_at for better organization
655
+ day_metadata.sort(key=lambda x: x.get('created_at', ''), reverse=True)
656
+
657
+ # Save to temp file
658
+ save_jsonl(local_path, day_metadata)
659
+ local_files.append((local_path, repo_path, len(day_metadata)))
660
+
661
+ # Upload each file with delay
662
+ agent_success = 0
663
+ agent_error = 0
664
+
665
+ for file_idx, (local_path, repo_path, pr_count) in enumerate(local_files, 1):
666
+ file_count += 1
667
+
668
+ if upload_single_file_with_retry(
669
+ api=api,
670
+ local_path=local_path,
671
+ repo_path=repo_path,
672
+ repo_id=PR_METADATA_REPO,
673
+ repo_type="dataset",
674
+ commit_message=f"Update {repo_path}",
675
+ max_retries=MAX_RETRIES
676
+ ):
677
+ agent_success += 1
678
+ success_count += 1
679
+ else:
680
+ agent_error += 1
681
+ error_count += 1
682
+
683
+ # Add delay between uploads (except for last file)
684
+ if file_idx < len(local_files):
685
+ time.sleep(UPLOAD_DELAY_SECONDS)
686
+
687
+ finally:
688
+ # Clean up temp directory
689
+ if os.path.exists(agent_temp_dir):
690
+ import shutil
691
+ shutil.rmtree(agent_temp_dir)
692
+
693
+ if error_count > 0:
694
+ print(f"Upload complete: {success_count}/{total_files} succeeded, {error_count} errors")
695
+ else:
696
+ print(f"Upload complete: {success_count}/{total_files} files")
697
+
698
+ return success_count, error_count
699
 
700
  except Exception as e:
701
+ print(f"Error during batch upload: {str(e)}")
702
  import traceback
703
  traceback.print_exc()
704
+ return 0, total_files if 'total_files' in locals() else 0
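Because consecutive uploads are spaced by UPLOAD_DELAY_SECONDS, a rough lower bound on the pacing overhead (ignoring the uploads themselves and any retries) is:

    n_files = 300  # illustrative total across all agents
    print(f"~{n_files * UPLOAD_DELAY_SECONDS / 60:.0f} min spent on inter-upload delays")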
705
 
706
 
707
  def load_agents_from_hf():
 
715
  agents = []
716
 
717
  # List all files in the repository
718
+ files = list_repo_files_with_backoff(api=api, repo_id=AGENTS_REPO, repo_type="dataset")
719
 
720
  # Filter for JSON files only
721
  json_files = [f for f in files if f.endswith('.json')]
722
 
 
 
723
  # Download and parse each JSON file
724
  for json_file in json_files:
725
  try:
 
743
  agents.append(agent_data)
744
 
745
  except Exception as e:
746
+ print(f"Error loading {json_file}: {str(e)}")
747
  continue
748
 
749
+ print(f"Download complete: {len(agents)} agents")
750
+
751
  return agents
752
 
753
  except Exception as e:
 
786
  }
787
 
788
 
789
+ def calculate_monthly_metrics_by_agent(all_metadata_dict, agents):
790
  """
791
  Calculate monthly metrics for all agents for visualization.
792
 
793
  Args:
794
+ all_metadata_dict: Dictionary mapping agent identifier to list of PR metadata
795
+ agents: List of agent dictionaries with metadata
796
 
797
  Returns:
798
+ dict: {
799
+ 'agents': list of agent names,
800
+ 'months': list of month labels (e.g., '2025-01'),
801
+ 'data': {
802
+ agent_name: {
803
+ 'acceptance_rates': list of acceptance rates by month,
804
+ 'total_prs': list of PR counts by month,
805
+ 'merged_prs': list of merged PR counts by month,
806
+ }
807
+ }
808
+ }
809
  """
 
 
810
  # Create mapping from agent_identifier to agent_name
811
+ identifier_to_name = {agent.get('github_identifier'): agent.get('name') for agent in agents if agent.get('github_identifier')}
812
+
813
+ if not all_metadata_dict:
814
+ return {'agents': [], 'months': [], 'data': {}}
 
815
 
816
  # Group by agent and month
817
  agent_month_data = defaultdict(lambda: defaultdict(list))
818
 
819
+ # Flatten the dict of lists into a single list with agent_identifier added
820
+ for agent_identifier, metadata_list in all_metadata_dict.items():
821
+ for pr_meta in metadata_list:
822
+ created_at = pr_meta.get('created_at')
823
 
824
+ if not created_at:
825
+ continue
826
 
827
+ # Get agent_name from identifier
828
+ agent_name = identifier_to_name.get(agent_identifier, agent_identifier)
829
 
830
+ try:
831
+ dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
832
+ month_key = f"{dt.year}-{dt.month:02d}"
833
+ agent_month_data[agent_name][month_key].append(pr_meta)
834
+ except Exception as e:
835
+ print(f"Warning: Could not parse date '{created_at}': {e}")
836
+ continue
837
 
838
  # Get all unique months and sort them
839
  all_months = set()
 
845
  result_data = {}
846
  for agent_name, month_dict in agent_month_data.items():
847
  acceptance_rates = []
848
+ total_prs_list = []
849
+ merged_prs_list = []
850
  closed_not_merged_list = []
851
 
852
  for month in months:
 
867
  acceptance_rate = (merged_count / total_decisions * 100) if total_decisions > 0 else None
868
 
869
  acceptance_rates.append(acceptance_rate)
870
+ total_prs_list.append(total_count)
871
+ merged_prs_list.append(merged_count)
872
  closed_not_merged_list.append(closed_not_merged_count)
873
 
874
  result_data[agent_name] = {
875
  'acceptance_rates': acceptance_rates,
876
+ 'total_prs': total_prs_list,
877
+ 'merged_prs': merged_prs_list,
878
  'closed_not_merged': closed_not_merged_list
879
  }
880
 
 
887
  }
888
 
889
 
890
+ def construct_leaderboard_from_metadata(all_metadata_dict, agents):
891
  """
892
+ Construct leaderboard from in-memory PR metadata.
893
 
894
  Args:
895
+ all_metadata_dict: Dictionary mapping agent identifier to list of PR metadata
896
+ agents: List of agent dictionaries with metadata
897
 
898
  Returns:
899
+ Dictionary of agent stats.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
900
  """
901
+ if not agents:
902
+ print("Error: No agents found")
903
+ return {}
 
 
904
 
 
 
 
905
  cache_dict = {}
906
 
907
  for agent in agents:
908
  identifier = agent.get('github_identifier')
909
  agent_name = agent.get('name', 'Unknown')
910
 
911
+ # Get metadata for this agent from the dictionary
912
+ bot_metadata = all_metadata_dict.get(identifier, [])
913
 
914
  # Calculate stats
915
  stats = calculate_pr_stats_from_metadata(bot_metadata)
916
 
917
  cache_dict[identifier] = {
918
  'name': agent_name,
919
+ 'website': agent.get('website', 'N/A'),
920
  'github_identifier': identifier,
921
  **stats
922
  }
 
924
  return cache_dict
925
 
926
 
927
+ def save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics):
928
  """
929
+ Save leaderboard data and monthly metrics to HuggingFace dataset as swe-pr.json.
930
 
931
  Args:
932
+ leaderboard_dict: Dictionary of agent stats from construct_leaderboard_from_metadata()
933
+ monthly_metrics: Monthly metrics data from calculate_monthly_metrics_by_agent()
934
 
935
  Returns:
936
+ bool: True if successful, False otherwise
937
  """
938
  try:
939
  token = get_hf_token()
 
941
  raise Exception("No HuggingFace token found")
942
 
943
  api = HfApi(token=token)
944
+ filename = "swe-pr.json"
945
 
946
+ # Combine leaderboard and monthly metrics
947
  combined_data = {
948
+ 'last_updated': datetime.now(timezone.utc).isoformat(),
949
+ 'leaderboard': leaderboard_dict,
950
  'monthly_metrics': monthly_metrics,
951
+ 'metadata': {
952
+ 'leaderboard_time_frame_days': LEADERBOARD_TIME_FRAME_DAYS
953
+ }
954
  }
955
 
956
+ # Save locally first
957
+ with open(filename, 'w') as f:
958
+ json.dump(combined_data, f, indent=2)
 
 
959
 
960
+ try:
961
+ # Upload to HuggingFace with retry logic
962
  upload_file_with_backoff(
963
+ api=api,
964
+ path_or_fileobj=filename,
965
+ path_in_repo=filename,
966
  repo_id=LEADERBOARD_REPO,
967
  repo_type="dataset"
968
  )
 
969
  return True
 
970
  finally:
971
+ # Always clean up local file
972
+ if os.path.exists(filename):
973
+ os.remove(filename)
974
 
975
  except Exception as e:
976
+ print(f"Error saving leaderboard data: {str(e)}")
977
  import traceback
978
  traceback.print_exc()
979
  return False
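Consumers such as the Gradio app can read the published file back through the same Hub client; a sketch assuming read access to the dataset:

    from huggingface_hub import hf_hub_download
    import json

    path = hf_hub_download(repo_id=LEADERBOARD_REPO, filename="swe-pr.json", repo_type="dataset")
    with open(path) as f:
        data = json.load(f)
    print(data["last_updated"], "-", len(data["leaderboard"]), "agents on the leaderboard")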
 
986
  def mine_all_agents():
987
  """
988
  Mine PR metadata for all agents within LEADERBOARD_TIME_FRAME_DAYS and save to HuggingFace.
989
+ Downloads GHArchive data first, then uses ONE DuckDB query for ALL agents, then batch uploads with time gaps.
990
  """
991
+ # Step 1: Download GHArchive data
992
+ print(f"\n[1/5] Downloading GHArchive data...")
993
+
994
+ if not download_all_gharchive_data():
995
+ print("Warning: Download had errors, continuing with available data...")
996
+
997
+ # Step 2: Load agent metadata from HuggingFace
998
+ print(f"\n[2/5] Loading agent metadata...")
999
+
1000
  agents = load_agents_from_hf()
1001
  if not agents:
1002
+ print("Error: No agents found")
1003
  return
1004
 
1005
  # Extract all identifiers
1006
  identifiers = [agent['github_identifier'] for agent in agents if agent.get('github_identifier')]
1007
  if not identifiers:
1008
+ print("Error: No valid agent identifiers found")
1009
  return
1010
 
1011
+ print(f"\n[3/5] Mining PR metadata ({len(identifiers)} agents, {LEADERBOARD_TIME_FRAME_DAYS} days)...")
 
 
 
 
1012
 
1013
+ # Initialize DuckDB connection
1014
  try:
1015
+ conn = get_duckdb_connection()
1016
  except Exception as e:
1017
+ print(f"Failed to initialize DuckDB connection: {str(e)}")
1018
  return
1019
 
1020
  # Define time range: past LEADERBOARD_TIME_FRAME_DAYS (excluding today)
 
1023
  start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
1024
 
1025
  try:
1026
+ # Use single query for all agents
1027
+ all_metadata = fetch_all_pr_metadata_single_query(
1028
+ conn, identifiers, start_date, end_date
 
1029
  )
1030
 
1031
  # Calculate summary statistics
1032
  total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
1033
  agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
1034
 
1035
+ print(f"Query complete: {total_prs} PRs found for {agents_with_data}/{len(agents)} agents")
 
 
 
 
 
1036
 
1037
  except Exception as e:
1038
+ print(f"Error during DuckDB fetch: {str(e)}")
1039
  import traceback
1040
  traceback.print_exc()
1041
  return
1042
+ finally:
1043
+ # Close DuckDB connection
1044
+ conn.close()
1045
+
1046
+ # Step 4: Batch upload PR metadata with time gaps
1047
+ print(f"\n[4/5] Uploading PR metadata...")
1048
+
1049
+ success_count, error_count = batch_upload_pr_metadata(all_metadata)
1050
 
1051
+ # Step 5: Construct and save leaderboard data
1052
+ print(f"\n[5/5] Saving leaderboard...")
 
 
1053
 
1054
  try:
1055
+ # Construct leaderboard from in-memory data
1056
+ leaderboard_dict = construct_leaderboard_from_metadata(all_metadata, agents)
1057
+
1058
+ # Calculate monthly metrics from in-memory data
1059
+ monthly_metrics = calculate_monthly_metrics_by_agent(all_metadata, agents)
1060
+
1061
+ # Save to HuggingFace
1062
+ save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics)
1063
+
1064
+ print(f"\nCOMPLETE: {success_count} files uploaded" + (f", {error_count} errors" if error_count > 0 else ""))
 
 
 
 
 
 
 
 
 
 
 
 
 
1065
 
1066
  except Exception as e:
1067
+ print(f"Error saving leaderboard: {str(e)}")
1068
  import traceback
1069
  traceback.print_exc()
1070
 
1071
 
1072
+ # =============================================================================
1073
+ # SCHEDULER SETUP
1074
+ # =============================================================================
1075
+
1076
+ def setup_scheduler():
1077
+ """
1078
+ Set up APScheduler to run mining jobs periodically.
1079
+ The schedule is configured via the module-level SCHEDULE_* constants.
1080
+
1081
+ Configuration constants:
1082
+ - SCHEDULE_ENABLED: Enable/disable the scheduler (default: False)
1083
+ - SCHEDULE_DAY_OF_MONTH: Day of month to run (default: 8, second week)
1084
+ - SCHEDULE_HOUR: Hour to run (0-23, default: 0)
1085
+ - SCHEDULE_MINUTE: Minute to run (0-59, default: 0)
1086
+ - SCHEDULE_TIMEZONE: Timezone for scheduling (default: UTC)
1087
+ """
1088
+ # Configure logging for APScheduler
1089
+ logging.basicConfig(
1090
+ level=logging.INFO,
1091
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
1092
+ )
1093
+
1094
+ # Disable verbose HTTP request logging from httpx (used by huggingface_hub)
1095
+ logging.getLogger('httpx').setLevel(logging.WARNING)
1096
+
1097
+ # Create scheduler
1098
+ scheduler = BlockingScheduler(timezone=SCHEDULE_TIMEZONE)
1099
+
1100
+ # Create cron trigger with configured schedule (monthly on specific day)
1101
+ trigger = CronTrigger(
1102
+ day=SCHEDULE_DAY_OF_MONTH,
1103
+ hour=SCHEDULE_HOUR,
1104
+ minute=SCHEDULE_MINUTE,
1105
+ timezone=SCHEDULE_TIMEZONE
1106
+ )
1107
+
1108
+ # Add job to scheduler
1109
+ scheduler.add_job(
1110
+ mine_all_agents,
1111
+ trigger=trigger,
1112
+ id='mine_all_agents',
1113
+ name='Mine GHArchive data for all agents',
1114
+ replace_existing=True
1115
+ )
1116
+
1117
+ # Print schedule information
1118
+ from datetime import datetime
1119
+ next_run = trigger.get_next_fire_time(None, datetime.now(trigger.timezone))
1120
+ print(f"Scheduler: Monthly on day {SCHEDULE_DAY_OF_MONTH} at {SCHEDULE_HOUR:02d}:{SCHEDULE_MINUTE:02d} {SCHEDULE_TIMEZONE}")
1121
+ print(f"Next run: {next_run}\n")
1122
+
1123
+ # Start scheduler (blocking call)
1124
+ print(f"\nScheduler started")
1125
+ scheduler.start()
1126
+
1127
+
1128
  # =============================================================================
1129
  # ENTRY POINT
1130
  # =============================================================================
1131
 
1132
  if __name__ == "__main__":
1133
+ if SCHEDULE_ENABLED:
1134
+ # Run with scheduler
1135
+ setup_scheduler()
1136
+ else:
1137
+ # Run without scheduler, just mine once
1138
+ mine_all_agents()
requirements.txt CHANGED
@@ -1,12 +1,10 @@
1
  APScheduler
2
  backoff
3
- datasets
4
- db-dtypes
5
- google-cloud-bigquery
6
  gradio
7
  gradio_leaderboard
8
  huggingface_hub
9
  pandas
10
  plotly
11
- PyGithub
12
- python-dotenv
 
1
  APScheduler
2
  backoff
3
+ duckdb[all]
 
 
4
  gradio
5
  gradio_leaderboard
6
  huggingface_hub
7
  pandas
8
  plotly
9
+ python-dotenv
10
+ requests