zhiminy committed on
Commit
88db242
·
1 Parent(s): 19a4aed
Files changed (3) hide show
  1. README.md +2 -2
  2. app.py +15 -135
  3. msr.py +1 -44
README.md CHANGED
@@ -59,13 +59,13 @@ We search GitHub using multiple query patterns to catch all PRs associated with
59
  The leaderboard refreshes automatically every day at 12:00 AM UTC.
60
 
61
  **Community Submissions**
62
- Anyone can submit a coding agent to track via the leaderboard. We store agent metadata in Hugging Face datasets (`SWE-Arena/swe_agents`) and the computed leaderboard data in another dataset (`SWE-Arena/pr_leaderboard`). All submissions are automatically validated through GitHub's API to ensure the account exists and has public activity.
63
 
64
  ## Using the Leaderboard
65
 
66
  ### Just Browsing?
67
  Head to the Leaderboard tab where you'll find:
68
- - **Searchable table**: Search by agent name or organization
69
  - **Filterable columns**: Filter by acceptance rate to find top performers
70
  - **Monthly charts**: Scroll down to see acceptance rate trends and PR activity over time
71
 
 
59
  The leaderboard refreshes automatically every day at 12:00 AM UTC.
60
 
61
  **Community Submissions**
62
+ Anyone can submit a coding agent to track via the leaderboard. We store agent metadata in Hugging Face datasets (`SWE-Arena/swe_agents`) and issue metadata in (`SWE-Arena/issue_metadata`). The leaderboard is dynamically constructed from the issue metadata. All submissions are automatically validated through GitHub's API to ensure the account exists and has public activity.
63
 
64
  ## Using the Leaderboard
65
 
66
  ### Just Browsing?
67
  Head to the Leaderboard tab where you'll find:
68
+ - **Searchable table**: Search by agent name or website
69
  - **Filterable columns**: Filter by acceptance rate to find top performers
70
  - **Monthly charts**: Scroll down to see acceptance rate trends and PR activity over time
71
 
app.py CHANGED
@@ -44,16 +44,14 @@ else:
44
  DEBUG_MODE = os.getenv('DEBUG_MODE', 'False').lower() in ('true', '1', 'yes')
45
 
46
  # In-memory cache for debug mode (data persists during session but NOT saved to HF)
47
- DEBUG_LEADERBOARD_CACHE = {}
48
  DEBUG_PR_METADATA_CACHE = defaultdict(list)
49
 
50
  AGENTS_REPO = "SWE-Arena/swe_agents" # HuggingFace dataset for agent metadata
51
- LEADERBOARD_REPO = "SWE-Arena/pr_leaderboard"
52
  PR_METADATA_REPO = "SWE-Arena/pr_metadata" # HuggingFace dataset for PR metadata
53
 
54
  LEADERBOARD_COLUMNS = [
55
  ("Agent Name", "string"),
56
- ("Organization", "string"),
57
  ("Total PRs", "number"),
58
  ("Merged PRs", "number"),
59
  ("Acceptance Rate (%)", "number"),
@@ -1178,34 +1176,6 @@ def load_agents_from_hf():
1178
  return None
1179
 
1180
 
1181
- def load_leaderboard_dataset():
1182
- """Load leaderboard data from HuggingFace dataset for current year.
1183
- In debug mode, loads from in-memory cache if available."""
1184
- # In debug mode, check in-memory cache first
1185
- if DEBUG_MODE and DEBUG_LEADERBOARD_CACHE:
1186
- print(f"🐛 DEBUG MODE: Loading leaderboard from in-memory cache ({len(DEBUG_LEADERBOARD_CACHE)} entries)")
1187
- return list(DEBUG_LEADERBOARD_CACHE.values())
1188
-
1189
- try:
1190
- year = datetime.now().year
1191
- filename = f"{year}.csv"
1192
-
1193
- # Try to download the CSV file for current year
1194
- file_path = hf_hub_download(
1195
- repo_id=LEADERBOARD_REPO,
1196
- filename=filename,
1197
- repo_type="dataset"
1198
- )
1199
-
1200
- # Load CSV into list of dicts
1201
- df = pd.read_csv(file_path)
1202
- data = df.to_dict('records')
1203
- print(f"✓ Loaded {len(data)} entries from {filename}")
1204
- return data
1205
-
1206
- except Exception as e:
1207
- print(f"Could not load leaderboard dataset for year {datetime.now().year}: {str(e)}")
1208
- return None
1209
 
1210
 
1211
  def get_hf_token():
@@ -1297,56 +1267,6 @@ def save_agent_to_hf(data):
1297
  return False
1298
 
1299
 
1300
- def save_leaderboard_to_hf(cache_dict):
1301
- """Save complete leaderboard to HuggingFace dataset as CSV.
1302
- In debug mode, saves to in-memory cache only."""
1303
- # Skip saving in debug mode - use in-memory cache instead
1304
- if DEBUG_MODE:
1305
- global DEBUG_LEADERBOARD_CACHE
1306
- # Filter out agents with zero total PRs
1307
- filtered_cache_dict = {k: v for k, v in cache_dict.items() if v.get('total_prs', 0) > 0}
1308
- DEBUG_LEADERBOARD_CACHE = filtered_cache_dict.copy()
1309
- data_list = dict_to_cache(filtered_cache_dict)
1310
- print(f"🐛 DEBUG MODE: Saved to in-memory cache only ({len(data_list)} entries) - NOT saved to HuggingFace")
1311
- return True
1312
-
1313
- try:
1314
- token = get_hf_token()
1315
- if not token:
1316
- raise Exception("No HuggingFace token found. Please set HF_TOKEN in your Space settings.")
1317
-
1318
- # Filter out agents with zero total PRs
1319
- filtered_cache_dict = {k: v for k, v in cache_dict.items() if v.get('total_prs', 0) > 0}
1320
- # Convert to DataFrame
1321
- data_list = dict_to_cache(filtered_cache_dict)
1322
- df = pd.DataFrame(data_list)
1323
-
1324
- # Save to CSV with year as filename
1325
- year = datetime.now().year
1326
- filename = f"{year}.csv"
1327
- df.to_csv(filename, index=False)
1328
-
1329
- try:
1330
- # Upload to HuggingFace
1331
- api = HfApi()
1332
- upload_with_retry(
1333
- api=api,
1334
- path_or_fileobj=filename,
1335
- path_in_repo=filename,
1336
- repo_id=LEADERBOARD_REPO,
1337
- repo_type="dataset",
1338
- token=token
1339
- )
1340
- print(f"✓ Saved leaderboard to HuggingFace as {filename} ({len(data_list)} entries)")
1341
- return True
1342
- finally:
1343
- # Always clean up local file, even if upload fails
1344
- if os.path.exists(filename):
1345
- os.remove(filename)
1346
-
1347
- except Exception as e:
1348
- print(f"✗ Error saving leaderboard: {str(e)}")
1349
- return False
1350
 
1351
 
1352
  # =============================================================================
@@ -1436,7 +1356,7 @@ def update_all_agents_incremental():
1436
  # Merge metadata with stats
1437
  cache_dict[identifier] = {
1438
  'agent_name': agent_name,
1439
- 'organization': agent.get('organization', 'Unknown'),
1440
  'github_identifier': identifier,
1441
  **stats
1442
  }
@@ -1485,7 +1405,7 @@ def construct_leaderboard_from_metadata():
1485
 
1486
  cache_dict[identifier] = {
1487
  'agent_name': agent_name,
1488
- 'organization': agent.get('organization', 'Unknown'),
1489
  'github_identifier': identifier,
1490
  **stats
1491
  }
@@ -1496,7 +1416,7 @@ def construct_leaderboard_from_metadata():
1496
  def initialize_data():
1497
  """
1498
  Initialize data on application startup.
1499
- Priority: 1) Leaderboard dataset ({year}.csv), 2) PR metadata (if available), 3) Full GitHub mining
1500
 
1501
  In DEBUG MODE:
1502
  - If no data available, automatically mine up to 10 PRs per query per agent
@@ -1506,26 +1426,15 @@ def initialize_data():
1506
 
1507
  year = datetime.now().year
1508
 
1509
- # STEP 1: Try loading existing leaderboard CSV from SWE-Arena/pr_leaderboard
1510
- print(f"STEP 1: Checking for {year}.csv in SWE-Arena/pr_leaderboard...")
1511
- leaderboard_data = load_leaderboard_dataset()
1512
- if leaderboard_data:
1513
- print(f"✓ Found and loaded {year}.csv from leaderboard repository")
1514
- print("✓ Initialized from leaderboard dataset")
1515
- return
1516
-
1517
- print(f" {year}.csv not found in leaderboard repository")
1518
-
1519
- # STEP 2: Try constructing from PR metadata in SWE-Arena/pr_metadata (fast, memory-efficient)
1520
- print(f"STEP 2: Checking SWE-Arena/pr_metadata for existing data...")
1521
  try:
1522
  cache_dict = construct_leaderboard_from_metadata()
1523
  # Check if there's actually meaningful data (at least one agent with PRs)
1524
  has_data = any(entry.get('total_prs', 0) > 0 for entry in cache_dict.values())
1525
  if cache_dict and has_data:
1526
  print(f"✓ Found PR metadata in pr_metadata repository")
1527
- save_leaderboard_to_hf(cache_dict)
1528
- print("✓ Initialized from PR metadata and saved as CSV")
1529
  return
1530
  else:
1531
  print(" No meaningful PR metadata found in pr_metadata repository")
@@ -1540,10 +1449,7 @@ def initialize_data():
1540
  print(f"✓ Loaded {len(agents)} agents from HuggingFace")
1541
  print("⛏️ Mining GitHub data in debug mode (limited to 10 PRs per query)...")
1542
  cache_dict = update_all_agents_incremental()
1543
- if cache_dict:
1544
- # In debug mode, this won't actually save to HF
1545
- save_leaderboard_to_hf(cache_dict)
1546
- print("✓ Debug mining complete (data NOT saved to HuggingFace)")
1547
  return
1548
  else:
1549
  print("⚠️ No agents found. Waiting for first submission...")
@@ -1555,8 +1461,6 @@ def initialize_data():
1555
  print(f"✓ Loaded {len(agents)} agents from HuggingFace")
1556
  print("⛏️ Mining GitHub data (this may take a while)...")
1557
  cache_dict = update_all_agents_incremental()
1558
- if cache_dict:
1559
- save_leaderboard_to_hf(cache_dict)
1560
  return
1561
 
1562
  # No data available
@@ -1689,25 +1593,25 @@ def create_monthly_metrics_plot():
1689
 
1690
  def get_leaderboard_dataframe():
1691
  """
1692
- Load leaderboard data from HuggingFace and convert to pandas DataFrame for display.
1693
  Returns formatted DataFrame sorted by acceptance rate.
1694
  """
1695
- # Load leaderboard data from HuggingFace
1696
- leaderboard_data = load_leaderboard_dataset()
1697
 
1698
- if not leaderboard_data:
1699
  # Return empty DataFrame with correct columns if no data
1700
  column_names = [col[0] for col in LEADERBOARD_COLUMNS]
1701
  return pd.DataFrame(columns=column_names)
1702
 
1703
  rows = []
1704
- for data in leaderboard_data:
1705
  # Filter out agents with zero total PRs
1706
  if data.get('total_prs', 0) > 0:
1707
  # Only include display-relevant fields
1708
  rows.append([
1709
  data.get('agent_name', 'Unknown'),
1710
- data.get('organization', 'Unknown'),
1711
  data.get('total_prs', 0),
1712
  data.get('merged', 0),
1713
  data.get('acceptance_rate', 0.0),
@@ -1791,21 +1695,6 @@ def submit_agent(identifier, agent_name, organization, description, website):
1791
  # Save metadata to HuggingFace
1792
  save_pr_metadata_to_hf(metadata_list, identifier)
1793
 
1794
- # Calculate stats from metadata
1795
- stats = calculate_pr_stats_from_metadata(metadata_list)
1796
-
1797
- # Load current leaderboard
1798
- leaderboard_data = load_leaderboard_dataset()
1799
- if not leaderboard_data:
1800
- leaderboard_data = []
1801
-
1802
- # Convert to dict for easy updating
1803
- cache_dict = {entry['github_identifier']: entry for entry in leaderboard_data}
1804
- cache_dict[identifier] = {**submission, **stats}
1805
-
1806
- # Save to HuggingFace
1807
- save_leaderboard_to_hf(cache_dict)
1808
-
1809
  return f"✅ Successfully submitted {agent_name}!", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1810
 
1811
  except Exception as e:
@@ -1873,15 +1762,6 @@ def daily_update_task():
1873
  print(f" PRs updated (closed/merged): {total_updated}")
1874
  print(f"{'='*80}")
1875
 
1876
- # Reconstruct leaderboard from all stored metadata
1877
- print(f"\n📈 Rebuilding leaderboard from refreshed data...")
1878
- cache_dict = construct_leaderboard_from_metadata()
1879
-
1880
- if cache_dict:
1881
- # Save leaderboard
1882
- save_leaderboard_to_hf(cache_dict)
1883
- print("✓ Leaderboard updated successfully")
1884
-
1885
  print(f"\n✅ Daily update completed at {datetime.now(timezone.utc).isoformat()}")
1886
 
1887
  except Exception as e:
@@ -1943,7 +1823,7 @@ with gr.Blocks(title="SWE Agent PR Leaderboard", theme=gr.themes.Soft()) as app:
1943
  leaderboard_table = Leaderboard(
1944
  value=get_leaderboard_dataframe(),
1945
  datatype=LEADERBOARD_COLUMNS,
1946
- search_columns=["Agent Name", "Organization"],
1947
  filter_columns=["Acceptance Rate (%)"]
1948
  )
1949
 
 
44
  DEBUG_MODE = os.getenv('DEBUG_MODE', 'False').lower() in ('true', '1', 'yes')
45
 
46
  # In-memory cache for debug mode (data persists during session but NOT saved to HF)
 
47
  DEBUG_PR_METADATA_CACHE = defaultdict(list)
48
 
49
  AGENTS_REPO = "SWE-Arena/swe_agents" # HuggingFace dataset for agent metadata
 
50
  PR_METADATA_REPO = "SWE-Arena/pr_metadata" # HuggingFace dataset for PR metadata
51
 
52
  LEADERBOARD_COLUMNS = [
53
  ("Agent Name", "string"),
54
+ ("Website", "string"),
55
  ("Total PRs", "number"),
56
  ("Merged PRs", "number"),
57
  ("Acceptance Rate (%)", "number"),
 
1176
  return None
1177
 
1178
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1179
 
1180
 
1181
  def get_hf_token():
 
1267
  return False
1268
 
1269
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1270
 
1271
 
1272
  # =============================================================================
 
1356
  # Merge metadata with stats
1357
  cache_dict[identifier] = {
1358
  'agent_name': agent_name,
1359
+ 'website': agent.get('website', 'Unknown'),
1360
  'github_identifier': identifier,
1361
  **stats
1362
  }
 
1405
 
1406
  cache_dict[identifier] = {
1407
  'agent_name': agent_name,
1408
+ 'website': agent.get('website', 'Unknown'),
1409
  'github_identifier': identifier,
1410
  **stats
1411
  }
 
1416
  def initialize_data():
1417
  """
1418
  Initialize data on application startup.
1419
+ Constructs leaderboard from PR metadata only.
1420
 
1421
  In DEBUG MODE:
1422
  - If no data available, automatically mine up to 10 PRs per query per agent
 
1426
 
1427
  year = datetime.now().year
1428
 
1429
+ # Try constructing from PR metadata in SWE-Arena/pr_metadata (fast, memory-efficient)
1430
+ print(f"Checking SWE-Arena/pr_metadata for existing data...")
 
 
 
 
 
 
 
 
 
 
1431
  try:
1432
  cache_dict = construct_leaderboard_from_metadata()
1433
  # Check if there's actually meaningful data (at least one agent with PRs)
1434
  has_data = any(entry.get('total_prs', 0) > 0 for entry in cache_dict.values())
1435
  if cache_dict and has_data:
1436
  print(f"✓ Found PR metadata in pr_metadata repository")
1437
+ print("✓ Initialized from PR metadata")
 
1438
  return
1439
  else:
1440
  print(" No meaningful PR metadata found in pr_metadata repository")
 
1449
  print(f"✓ Loaded {len(agents)} agents from HuggingFace")
1450
  print("⛏️ Mining GitHub data in debug mode (limited to 10 PRs per query)...")
1451
  cache_dict = update_all_agents_incremental()
1452
+ print("✓ Debug mining complete (data NOT saved to HuggingFace)")
 
 
 
1453
  return
1454
  else:
1455
  print("⚠️ No agents found. Waiting for first submission...")
 
1461
  print(f"✓ Loaded {len(agents)} agents from HuggingFace")
1462
  print("⛏️ Mining GitHub data (this may take a while)...")
1463
  cache_dict = update_all_agents_incremental()
 
 
1464
  return
1465
 
1466
  # No data available
 
1593
 
1594
  def get_leaderboard_dataframe():
1595
  """
1596
+ Construct leaderboard data from PR metadata and convert to pandas DataFrame for display.
1597
  Returns formatted DataFrame sorted by acceptance rate.
1598
  """
1599
+ # Construct leaderboard from PR metadata
1600
+ cache_dict = construct_leaderboard_from_metadata()
1601
 
1602
+ if not cache_dict:
1603
  # Return empty DataFrame with correct columns if no data
1604
  column_names = [col[0] for col in LEADERBOARD_COLUMNS]
1605
  return pd.DataFrame(columns=column_names)
1606
 
1607
  rows = []
1608
+ for identifier, data in cache_dict.items():
1609
  # Filter out agents with zero total PRs
1610
  if data.get('total_prs', 0) > 0:
1611
  # Only include display-relevant fields
1612
  rows.append([
1613
  data.get('agent_name', 'Unknown'),
1614
+ data.get('website', 'Unknown'),
1615
  data.get('total_prs', 0),
1616
  data.get('merged', 0),
1617
  data.get('acceptance_rate', 0.0),
 
1695
  # Save metadata to HuggingFace
1696
  save_pr_metadata_to_hf(metadata_list, identifier)
1697
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1698
  return f"✅ Successfully submitted {agent_name}!", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1699
 
1700
  except Exception as e:
 
1762
  print(f" PRs updated (closed/merged): {total_updated}")
1763
  print(f"{'='*80}")
1764
 
 
 
 
 
 
 
 
 
 
1765
  print(f"\n✅ Daily update completed at {datetime.now(timezone.utc).isoformat()}")
1766
 
1767
  except Exception as e:
 
1823
  leaderboard_table = Leaderboard(
1824
  value=get_leaderboard_dataframe(),
1825
  datatype=LEADERBOARD_COLUMNS,
1826
+ search_columns=["Agent Name", "Website"],
1827
  filter_columns=["Acceptance Rate (%)"]
1828
  )
1829
 
msr.py CHANGED
@@ -64,11 +64,9 @@ else:
64
  # Constants (match app.py)
65
  # =============================================================================
66
 
67
- DEBUG_LEADERBOARD_CACHE = {}
68
  DEBUG_PR_METADATA_CACHE = defaultdict(list)
69
 
70
  AGENTS_REPO = "SWE-Arena/swe_agents"
71
- LEADERBOARD_REPO = "SWE-Arena/pr_leaderboard"
72
  PR_METADATA_REPO = "SWE-Arena/pr_metadata"
73
 
74
 
@@ -622,45 +620,6 @@ def get_already_mined_dates(agent_identifier, n_months=6):
622
  return set()
623
 
624
 
625
- def save_leaderboard_to_hf(cache_dict):
626
- if DEBUG_MODE:
627
- global DEBUG_LEADERBOARD_CACHE
628
- # Filter out agents with zero total PRs
629
- filtered_cache_dict = {k: v for k, v in cache_dict.items() if v.get('total_prs', 0) > 0}
630
- DEBUG_LEADERBOARD_CACHE = filtered_cache_dict.copy()
631
- data_list = dict_to_cache(filtered_cache_dict)
632
- print(f"🐛 DEBUG MODE: Saved to in-memory cache only ({len(data_list)} entries) - NOT saved to HuggingFace")
633
- return True
634
- try:
635
- token = get_hf_token()
636
- if not token:
637
- raise Exception("No HuggingFace token found. Please set HF_TOKEN in your environment.")
638
- # Filter out agents with zero total PRs
639
- filtered_cache_dict = {k: v for k, v in cache_dict.items() if v.get('total_prs', 0) > 0}
640
- data_list = dict_to_cache(filtered_cache_dict)
641
- df = pd.DataFrame(data_list)
642
- year = datetime.now().year
643
- filename = f"{year}.csv"
644
- df.to_csv(filename, index=False)
645
- api = HfApi()
646
- try:
647
- upload_with_retry(
648
- api=api,
649
- path_or_fileobj=filename,
650
- path_in_repo=filename,
651
- repo_id=LEADERBOARD_REPO,
652
- repo_type="dataset",
653
- token=token
654
- )
655
- print(f"✓ Saved leaderboard to HuggingFace as {filename} ({len(data_list)} entries)")
656
- return True
657
- finally:
658
- # Always clean up local file, even if upload fails
659
- if os.path.exists(filename):
660
- os.remove(filename)
661
- except Exception as e:
662
- print(f"✗ Error saving leaderboard: {str(e)}")
663
- return False
664
 
665
 
666
  def calculate_pr_stats_from_metadata(metadata_list):
@@ -745,7 +704,7 @@ def update_all_agents_incremental():
745
  stats = calculate_pr_stats_from_metadata(agent_metadata)
746
  cache_dict[identifier] = {
747
  'agent_name': agent_name,
748
- 'organization': agent.get('organization', 'Unknown'),
749
  'github_identifier': identifier,
750
  **stats
751
  }
@@ -761,8 +720,6 @@ def update_all_agents_incremental():
761
  def run_once():
762
  print("\n🚀 Immediate mining run started")
763
  cache_dict = update_all_agents_incremental()
764
- if cache_dict:
765
- save_leaderboard_to_hf(cache_dict)
766
  print("✅ Immediate mining run completed\n")
767
 
768
 
 
64
  # Constants (match app.py)
65
  # =============================================================================
66
 
 
67
  DEBUG_PR_METADATA_CACHE = defaultdict(list)
68
 
69
  AGENTS_REPO = "SWE-Arena/swe_agents"
 
70
  PR_METADATA_REPO = "SWE-Arena/pr_metadata"
71
 
72
 
 
620
  return set()
621
 
622
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
623
 
624
 
625
  def calculate_pr_stats_from_metadata(metadata_list):
 
704
  stats = calculate_pr_stats_from_metadata(agent_metadata)
705
  cache_dict[identifier] = {
706
  'agent_name': agent_name,
707
+ 'website': agent.get('website', 'Unknown'),
708
  'github_identifier': identifier,
709
  **stats
710
  }
 
720
  def run_once():
721
  print("\n🚀 Immediate mining run started")
722
  cache_dict = update_all_agents_incremental()
 
 
723
  print("✅ Immediate mining run completed\n")
724
 
725