zhiminy committed on
Commit 9621022 · 1 Parent(s): c27bef0

update daily msr

Files changed (1):
  1. app.py +96 -304
app.py CHANGED
@@ -465,27 +465,22 @@ def extract_pr_metadata(pr):
     }
 
 
-def fetch_all_prs_metadata(identifier, agent_name, token=None, start_from_date=None, exclude_dates=None):
     """
-    Fetch pull requests associated with a GitHub user or bot for the past 6 months.
-    Returns lightweight metadata instead of full PR objects.
-
-    This function employs time-based partitioning to navigate GitHub's 1000-result limit per query.
-    It searches using multiple query patterns:
-    - is:pr author:{identifier} (PRs authored by the bot)
-    - is:pr "co-authored-by: {identifier}" (PRs with commits co-authored by the bot)
-    - is:pr head:{identifier}/ (PRs with branch names starting with the bot identifier)
 
     Args:
         identifier: GitHub username or bot identifier
         agent_name: Human-readable name of the agent for metadata purposes
         token: GitHub API token for authentication
-        start_from_date: Only fetch PRs created after this date (for incremental updates)
-        exclude_dates: Set of date objects to exclude from mining (dates that have already been processed)
 
     Returns:
-        List of dictionaries containing minimal PR metadata
     """
 
     headers = {'Authorization': f'token {token}'} if token else {}
 
     # Debug mode: limit PR retrieval for testing
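
For readers unfamiliar with the search strategy the removed docstring describes, the sketch below (not part of the diff) shows how the three query patterns can be combined with a created: date window against the GitHub Search API. The helper name search_prs_window is hypothetical; the real code delegates to fetch_prs_with_time_partition, which further splits the window whenever a slice approaches GitHub's 1,000-result cap.

# Illustrative sketch only, not part of app.py. Assumes the `requests` package and a token string.
import requests

def search_prs_window(identifier, start_date, end_date, token=None):
    """Search one date window with the three query patterns described above."""
    headers = {'Authorization': f'token {token}'} if token else {}
    window = f"created:{start_date:%Y-%m-%d}..{end_date:%Y-%m-%d}"
    patterns = [
        f'is:pr author:{identifier}',             # PRs authored by the bot
        f'is:pr "co-authored-by: {identifier}"',  # PRs with co-authored commits
        f'is:pr head:{identifier}/',              # PRs whose branch starts with the identifier
    ]
    results = {}
    for pattern in patterns:
        resp = requests.get(
            "https://api.github.com/search/issues",
            headers=headers,
            params={"q": f"{pattern} {window}", "per_page": 100},
        )
        resp.raise_for_status()
        for item in resp.json().get("items", []):
            results[item["id"]] = item  # deduplicate by PR ID, as the real code does
    return list(results.values())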
@@ -508,27 +503,18 @@ def fetch_all_prs_metadata(identifier, agent_name, token=None, start_from_date=N
     # Use a dict to deduplicate PRs by ID
     prs_by_id = {}
 
-    # Define time range: past 6 months only (or from start_from_date if specified)
-    current_time = datetime.now(timezone.utc)
-    six_months_ago = current_time - timedelta(days=180)  # ~6 months
-
-    if start_from_date:
-        # Use start_from_date but ensure it's not older than 6 months
-        start_date = max(start_from_date, six_months_ago)
-    else:
-        start_date = six_months_ago
-
-    # End date is current time
-    end_date = current_time
 
     for query_pattern in query_patterns:
         print(f"\n🔍 Searching with query: {query_pattern}")
-        print(f"   Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
 
         pattern_start_time = time.time()
         initial_count = len(prs_by_id)
 
-        # Fetch with time partitioning
        prs_found = fetch_prs_with_time_partition(
            query_pattern,
            start_date,
@@ -550,47 +536,18 @@ def fetch_all_prs_metadata(identifier, agent_name, token=None, start_from_date=N
     # Convert to lightweight metadata
     all_prs = list(prs_by_id.values())
 
-    # Filter out PRs from excluded dates if specified
-    if exclude_dates:
-        filtered_prs = []
-        excluded_count = 0
-        for pr in all_prs:
-            created_at = pr.get('created_at')
-            if created_at:
-                try:
-                    dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
-                    pr_date = dt.date()
-                    if pr_date not in exclude_dates:
-                        filtered_prs.append(pr)
-                    else:
-                        excluded_count += 1
-                except Exception:
-                    filtered_prs.append(pr)  # Keep PRs with unparseable dates
-            else:
-                filtered_prs.append(pr)  # Keep PRs without created_at
-
-        if excluded_count > 0:
-            print(f"   ⏭️ Skipped {excluded_count} PRs from already-mined dates")
-        all_prs = filtered_prs
-
     if DEBUG_MODE:
-        print(f"\n✅ COMPLETE (DEBUG MODE): Found {len(all_prs)} unique PRs for {identifier}")
         print(f"   Note: In production mode, this would fetch ALL PRs")
     else:
-        print(f"\n✅ COMPLETE: Found {len(all_prs)} unique PRs for {identifier}")
         print(f"📦 Extracting minimal metadata...")
 
     metadata_list = [extract_pr_metadata(pr) for pr in all_prs]
 
-    # Calculate memory savings
-    import sys
-    original_size = sys.getsizeof(str(all_prs))
-    metadata_size = sys.getsizeof(str(metadata_list))
-    savings_pct = ((original_size - metadata_size) / original_size * 100) if original_size > 0 else 0
 
-    print(f"💾 Memory efficiency: {original_size // 1024}KB → {metadata_size // 1024}KB (saved {savings_pct:.1f}%)")
 
-    return metadata_list
 
 
 def calculate_pr_stats_from_metadata(metadata_list):
@@ -1073,59 +1030,6 @@ def get_daily_files_last_n_months(agent_identifier, n_months=6):
     return []
 
 
-def get_already_mined_dates(agent_identifier, n_months=6):
-    """
-    Get set of dates that have already been mined for an agent.
-
-    Args:
-        agent_identifier: GitHub identifier of the agent
-        n_months: Number of months to look back (default: 6)
-
-    Returns:
-        Set of date objects (datetime.date) that already have data files
-    """
-    try:
-        api = HfApi()
-
-        # Calculate date range
-        today = datetime.now(timezone.utc)
-        n_months_ago = today - timedelta(days=30 * n_months)
-
-        # List all files in the repository
-        files = api.list_repo_files(repo_id=PR_METADATA_REPO, repo_type="dataset")
-
-        # Filter for files in this agent's folder
-        agent_pattern = f"{agent_identifier}/"
-        agent_files = [f for f in files if f.startswith(agent_pattern) and f.endswith('.jsonl')]
-
-        mined_dates = set()
-        for filename in agent_files:
-            try:
-                # Extract date from filename: [agent_identifier]/YYYY.MM.DD.jsonl
-                parts = filename.split('/')
-                if len(parts) != 2:
-                    continue
-
-                date_part = parts[1].replace('.jsonl', '')  # Get YYYY.MM.DD
-                date_components = date_part.split('.')
-                if len(date_components) != 3:
-                    continue
-
-                file_year, file_month, file_day = map(int, date_components)
-                file_date = datetime(file_year, file_month, file_day, tzinfo=timezone.utc).date()
-
-                # Only include dates within the last n_months
-                if n_months_ago.date() <= file_date <= today.date():
-                    mined_dates.add(file_date)
-            except Exception as e:
-                print(f"   Warning: Could not parse date from filename {filename}: {e}")
-                continue
-
-        return mined_dates
-
-    except Exception as e:
-        print(f"   Warning: Could not get already-mined dates for {agent_identifier}: {str(e)}")
-        return set()
 
 
 def fetch_pr_current_status(pr_url, token):
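
The removed helper above relies on the per-day storage convention {agent_identifier}/YYYY.MM.DD.jsonl. A minimal sketch of composing and parsing such paths, with hypothetical helper names, follows.

# Illustrative sketch only, not part of app.py: the per-day file naming convention
# ({agent_identifier}/YYYY.MM.DD.jsonl) that the removed helper parsed.
from datetime import date, datetime
from typing import Optional

def daily_file_path(agent_identifier: str, day: date) -> str:
    # e.g. "my-bot/2024.07.01.jsonl" (hypothetical helper name)
    return f"{agent_identifier}/{day:%Y.%m.%d}.jsonl"

def parse_daily_file_date(path: str) -> Optional[date]:
    # Returns the date encoded in the filename, or None if it does not match.
    try:
        _agent, filename = path.split("/", 1)
        return datetime.strptime(filename, "%Y.%m.%d.jsonl").date()
    except ValueError:
        return None

assert parse_daily_file_date(daily_file_path("my-bot", date(2024, 7, 1))) == date(2024, 7, 1)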
@@ -1432,101 +1336,98 @@ def save_agent_to_hf(data):
 
 
 def update_all_agents_incremental():
     """
-    Memory-efficient incremental update of PR statistics for all agents.
 
     Strategy:
-    1. For each agent, load existing data from SWE-Arena/pr_metadata
-    2. Identify already-mined dates (based on filename: YYYY.MM.DD.jsonl)
-    3. Only fetch PRs from dates that haven't been mined yet (within last 6 months)
-    4. If no data exists at all, mine everything from scratch
-    5. Store minimal metadata (not full PR objects) to avoid storage limits
-    6. Construct leaderboard from ALL stored metadata (last 6 months)
-
-    Returns dictionary of all agent data with current stats.
     """
-    token = get_github_token()
 
-    # Load agent metadata from HuggingFace
-    agents = load_agents_from_hf()
-    if not agents:
-        print("No agents found in HuggingFace dataset")
-        return {}
 
-    cache_dict = {}
 
-    # Update each agent
-    for agent in agents:
-        identifier = agent.get('github_identifier')
-        agent_name = agent.get('agent_name', 'Unknown')
 
-        if not identifier:
-            print(f"Warning: Skipping agent without identifier: {agent}")
-            continue
 
-        try:
-            print(f"\n{'='*80}")
-            print(f"Processing: {agent_name} ({identifier})")
-            print(f"{'='*80}")
-
-            # Get already-mined dates for this agent (last 6 months)
-            already_mined_dates = get_already_mined_dates(identifier, n_months=6)
-
-            if already_mined_dates:
-                print(f"📅 Found {len(already_mined_dates)} already-mined dates")
-                print(f"   Re-mining ALL dates (including existing) to update metadata...")
-                # Re-mine ALL PRs (do NOT exclude already-mined dates)
-                # This ensures metadata like merged_at is updated even if day file exists
-                new_metadata = fetch_all_prs_metadata(
                     identifier,
-                    agent_name,
-                    token,
-                    start_from_date=None,  # Use full 6-month range
-                    exclude_dates=None  # Re-mine ALL dates (no exclusions)
                 )
-            else:
-                print(f"📅 No existing data found. Mining everything from scratch...")
-                # Mine everything from scratch (full 6-month range)
-                new_metadata = fetch_all_prs_metadata(
                     identifier,
                     agent_name,
                     token,
-                    start_from_date=None
                 )
 
-            if new_metadata:
-                # Save new metadata to HuggingFace (organized by agent_identifier/YYYY.MM.DD.jsonl)
-                print(f"💾 Saving {len(new_metadata)} new PR records...")
-                save_pr_metadata_to_hf(new_metadata, identifier)
-            else:
-                print(f"   No new PRs to save")
-
-            # Load ALL metadata to calculate stats (aggregates entire last 6 months)
-            print(f"📊 Calculating statistics from ALL stored metadata (last 6 months)...")
-            all_metadata = load_pr_metadata()
 
-            # Filter for this specific agent
-            agent_metadata = [pr for pr in all_metadata if pr.get('agent_identifier') == identifier]
 
-            # Calculate stats from metadata
-            stats = calculate_pr_stats_from_metadata(agent_metadata)
-
-            # Merge metadata with stats
-            cache_dict[identifier] = {
-                'agent_name': agent_name,
-                'website': agent.get('website', 'Unknown'),
-                'github_identifier': identifier,
-                **stats
-            }
 
-            print(f"✓ Updated {identifier}: {stats['total_prs']} PRs, {stats['acceptance_rate']}% acceptance")
 
-        except Exception as e:
-            print(f"✗ Error updating {identifier}: {str(e)}")
-            import traceback
-            traceback.print_exc()
-            continue
 
-    return cache_dict
 
  def construct_leaderboard_from_metadata():
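
The removed loop above reports total_prs and acceptance_rate from calculate_pr_stats_from_metadata, which is not shown in this diff. A minimal sketch of one plausible definition, assuming records carry a merged_at field as elsewhere in this file, could look like the following (the real implementation may differ):

# Illustrative sketch only, not part of app.py. One plausible way to derive the
# stats printed above; the real calculate_pr_stats_from_metadata may differ.
def pr_stats(metadata_list):
    total = len(metadata_list)
    merged = sum(1 for pr in metadata_list if pr.get('merged_at'))  # merged PRs carry a merged_at timestamp
    return {
        'total_prs': total,
        'acceptance_rate': round(100 * merged / total, 2) if total else 0.0,
    }

# Example: two records, one merged -> {'total_prs': 2, 'acceptance_rate': 50.0}
print(pr_stats([{'merged_at': '2024-07-01T12:00:00Z'}, {'merged_at': None}]))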
@@ -1568,58 +1469,6 @@ def construct_leaderboard_from_metadata():
     return cache_dict
 
 
-def initialize_data():
-    """
-    Initialize data on application startup.
-    Constructs leaderboard from PR metadata only.
-
-    In DEBUG MODE:
-    - If no data available, automatically mine up to 10 PRs per query per agent
-    - Does NOT save to HuggingFace datasets
-    """
-    print("🚀 Initializing leaderboard data...")
-
-    # Try constructing from PR metadata in SWE-Arena/pr_metadata (fast, memory-efficient)
-    print(f"Checking SWE-Arena/pr_metadata for existing data...")
-    try:
-        cache_dict = construct_leaderboard_from_metadata()
-        # Check if there's actually meaningful data (at least one agent with PRs)
-        has_data = any(entry.get('total_prs', 0) > 0 for entry in cache_dict.values())
-        if cache_dict and has_data:
-            print(f"✓ Found PR metadata in pr_metadata repository")
-            print("✓ Initialized from PR metadata")
-            return
-        else:
-            print("   No meaningful PR metadata found in pr_metadata repository")
-    except Exception as e:
-        print(f"   Could not construct from metadata: {e}")
-
-    # If in debug mode and no data available, mine immediately
-    if DEBUG_MODE:
-        print("\n🐛 DEBUG MODE: No data available, mining immediately (up to 10 PRs per query per agent)...")
-        agents = load_agents_from_hf()
-        if agents:
-            print(f"✓ Loaded {len(agents)} agents from HuggingFace")
-            print("⛏️ Mining GitHub data in debug mode (limited to 10 PRs per query)...")
-            cache_dict = update_all_agents_incremental()
-            print("✓ Debug mining complete (data NOT saved to HuggingFace)")
-            return
-        else:
-            print("⚠️ No agents found. Waiting for first submission...")
-            return
-
-    # Production mode: Fallback to full incremental mining from GitHub
-    agents = load_agents_from_hf()
-    if agents:
-        print(f"✓ Loaded {len(agents)} agents from HuggingFace")
-        print("⛏️ Mining GitHub data (this may take a while)...")
-        cache_dict = update_all_agents_incremental()
-        return
-
-    # No data available
-    print("⚠️ No data sources available. Waiting for first submission...")
-
-
 # =============================================================================
 # UI FUNCTIONS
 # =============================================================================
@@ -1792,7 +1641,8 @@ def get_leaderboard_dataframe():
 def submit_agent(identifier, agent_name, organization, description, website):
     """
     Submit a new agent to the leaderboard.
-    Validates input, saves submission, and fetches PR metadata (memory-efficient).
     """
     # Validate required fields
     if not identifier or not identifier.strip():
@@ -1836,64 +1686,8 @@ def submit_agent(identifier, agent_name, organization, description, website):
     if not save_agent_to_hf(submission):
         return "❌ Failed to save submission", get_leaderboard_dataframe(), create_monthly_metrics_plot()
 
-    # Fetch PR metadata immediately (memory-efficient)
-    token = get_github_token()
-    try:
-        print(f"Fetching PR metadata for {agent_name}...")
-
-        # Fetch lightweight metadata
-        metadata_list = fetch_all_prs_metadata(identifier, agent_name, token)
-
-        if metadata_list:
-            # Save metadata to HuggingFace
-            save_pr_metadata_to_hf(metadata_list, identifier)
-
-        return f"✅ Successfully submitted {agent_name}!", get_leaderboard_dataframe(), create_monthly_metrics_plot()
-
-    except Exception as e:
-        error_msg = f"⚠️ Submitted {agent_name}, but failed to fetch PR data: {str(e)}"
-        print(error_msg)
-        import traceback
-        traceback.print_exc()
-        return error_msg, get_leaderboard_dataframe(), create_monthly_metrics_plot()
-
-
-# =============================================================================
-# BACKGROUND TASKS
-# =============================================================================
-
-def daily_update_task():
-    """
-    Daily scheduled task (runs at 12:00 AM UTC) for regular PR mining.
-
-    Strategy:
-    1. Re-mine ALL PRs for all agents within the last 6 months (LEADERBOARD_TIME_FRAME_DAYS)
-    2. Update ALL day files, even if they already exist
-    3. This ensures metadata like 'merged_at' is always current (e.g., PRs merged after initial mining)
-
-    This replaces the old refresh_open_prs approach to ensure no stale data.
-    """
-    print(f"\n{'='*80}")
-    print(f"🕛 Daily Regular PR Mining started at {datetime.now(timezone.utc).isoformat()}")
-    print(f"{'='*80}")
-
-    try:
-        # Re-mine all PRs for all agents (will update existing day files)
-        print(f"📋 Re-mining all PRs within {LEADERBOARD_TIME_FRAME_DAYS} days for all agents...")
-        cache_dict = update_all_agents_incremental()
-
-        print(f"\n{'='*80}")
-        print(f"📊 Mining Summary:")
-        print(f"   Total agents processed: {len(cache_dict)}")
-        print(f"   All PR metadata updated (including existing day files)")
-        print(f"{'='*80}")
-
-        print(f"\n✅ Daily Regular PR Mining completed at {datetime.now(timezone.utc).isoformat()}")
-
-    except Exception as e:
-        print(f"✗ Daily mining failed: {str(e)}")
-        import traceback
-        traceback.print_exc()
 
 
 # =============================================================================
@@ -1922,19 +1716,17 @@ else:
     print("   (Explicitly set via '--no-debug' flag)")
     print()
 
-    initialize_data()
-
-    # Start APScheduler for daily regular PR mining at 12:00 AM UTC
     scheduler = BackgroundScheduler(timezone="UTC")
     scheduler.add_job(
-        daily_update_task,
        trigger=CronTrigger(hour=0, minute=0),  # 12:00 AM UTC daily
-        id='daily_regular_pr_mining',
-        name='Daily Regular PR Mining',
        replace_existing=True
    )
    scheduler.start()
-    print("✓ Scheduler started: Daily Regular PR Mining at 12:00 AM UTC")
 
    # Create Gradio interface
    with gr.Blocks(title="SWE Agent PR Leaderboard", theme=gr.themes.Soft()) as app:
 
@@ -465,27 +465,22 @@ def extract_pr_metadata(pr):
     }
 
 
+def fetch_daily_prs_metadata(identifier, agent_name, token=None, target_date=None):
     """
+    Fetch pull requests for a specific date (used for daily incremental updates).
 
     Args:
         identifier: GitHub username or bot identifier
         agent_name: Human-readable name of the agent for metadata purposes
         token: GitHub API token for authentication
+        target_date: Date object for which to fetch PRs (defaults to yesterday)
 
     Returns:
+        List of dictionaries containing minimal PR metadata for that date
     """
+    if target_date is None:
+        target_date = (datetime.now(timezone.utc) - timedelta(days=1)).date()
+
     headers = {'Authorization': f'token {token}'} if token else {}
 
     # Debug mode: limit PR retrieval for testing
@@ -508,27 +503,18 @@ def fetch_all_prs_metadata(identifier, agent_name, token=None, start_from_date=N
     # Use a dict to deduplicate PRs by ID
     prs_by_id = {}
 
+    # Convert target_date to datetime for API queries
+    start_date = datetime.combine(target_date, datetime.min.time()).replace(tzinfo=timezone.utc)
+    end_date = datetime.combine(target_date, datetime.max.time()).replace(tzinfo=timezone.utc)
 
     for query_pattern in query_patterns:
         print(f"\n🔍 Searching with query: {query_pattern}")
+        print(f"   Date: {target_date.strftime('%Y-%m-%d')}")
 
         pattern_start_time = time.time()
         initial_count = len(prs_by_id)
 
+        # Fetch with time partitioning (for single day)
        prs_found = fetch_prs_with_time_partition(
            query_pattern,
            start_date,
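
The added lines bound the search to a single UTC day by combining target_date with the day's minimum and maximum times. A small standalone sketch of the resulting window, and the day-granularity created: qualifier it corresponds to, is shown below (example date chosen arbitrarily):

# Illustrative sketch only, not part of app.py. Shows the one-day UTC window the
# added code builds and how it maps onto a GitHub search "created:" qualifier.
from datetime import date, datetime, timezone

target_date = date(2024, 7, 1)  # example value; the app defaults to "yesterday" in UTC
start_date = datetime.combine(target_date, datetime.min.time()).replace(tzinfo=timezone.utc)
end_date = datetime.combine(target_date, datetime.max.time()).replace(tzinfo=timezone.utc)

print(start_date.isoformat())  # 2024-07-01T00:00:00+00:00
print(end_date.isoformat())    # 2024-07-01T23:59:59.999999+00:00
# A search qualifier covering exactly that day (day-level granularity):
print(f"created:{start_date:%Y-%m-%d}..{end_date:%Y-%m-%d}")  # created:2024-07-01..2024-07-01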
 
@@ -550,47 +536,18 @@ def fetch_all_prs_metadata(identifier, agent_name, token=None, start_from_date=N
     # Convert to lightweight metadata
     all_prs = list(prs_by_id.values())
 
     if DEBUG_MODE:
+        print(f"\n✅ COMPLETE (DEBUG MODE): Found {len(all_prs)} unique PRs for {identifier} on {target_date}")
         print(f"   Note: In production mode, this would fetch ALL PRs")
     else:
+        print(f"\n✅ COMPLETE: Found {len(all_prs)} unique PRs for {identifier} on {target_date}")
         print(f"📦 Extracting minimal metadata...")
 
     metadata_list = [extract_pr_metadata(pr) for pr in all_prs]
 
+    return metadata_list
 
 
 def calculate_pr_stats_from_metadata(metadata_list):
@@ -1073,59 +1030,6 @@ def get_daily_files_last_n_months(agent_identifier, n_months=6):
     return []
 
 
 def fetch_pr_current_status(pr_url, token):
@@ -1432,101 +1336,98 @@ def save_agent_to_hf(data):
 
 
 def update_all_agents_incremental():
     """
+    Daily incremental update - refreshes open PRs and fetches new PRs for all agents.
 
     Strategy:
+    1. Refresh status of all open PRs from the last LEADERBOARD_TIME_FRAME_DAYS - 1 days
+       (to check if any have been merged or closed)
+    2. Fetch new PRs created yesterday (from 12:00 AM to 11:59:59 PM yesterday)
+    3. Update the corresponding daily files (YYYY.MM.DD.jsonl)
+    4. This runs daily to keep data fresh without re-mining everything
     """
+    print(f"\n{'='*80}")
+    print(f"🕛 Daily Incremental PR Mining started at {datetime.now(timezone.utc).isoformat()}")
+    print(f"{'='*80}")
 
+    try:
+        token = get_github_token()
 
+        # Load agent metadata from HuggingFace
+        agents = load_agents_from_hf()
+        if not agents:
+            print("No agents found in HuggingFace dataset")
+            return
 
+        # Calculate yesterday's date
+        yesterday = (datetime.now(timezone.utc) - timedelta(days=1)).date()
+        print(f"\n📅 Daily Incremental Update for {yesterday.strftime('%Y-%m-%d')} for all agents...")
 
+        agents_processed = 0
+        total_refreshed = 0
+        total_refreshed_updated = 0
+        total_new_prs = 0
 
+        # Update each agent
+        for agent in agents:
+            identifier = agent.get('github_identifier')
+            agent_name = agent.get('agent_name', 'Unknown')
+
+            if not identifier:
+                print(f"Warning: Skipping agent without identifier: {agent}")
+                continue
+
+            try:
+                print(f"\n{'='*80}")
+                print(f"Processing: {agent_name} ({identifier})")
+                print(f"{'='*80}")
+
+                # STEP 1: Refresh all open PRs from the last LEADERBOARD_TIME_FRAME_DAYS - 1 days
+                print(f"\n🔄 Step 1: Refreshing open PRs (last {LEADERBOARD_TIME_FRAME_DAYS - 1} days)...")
+                refreshed_checked, refreshed_updated = refresh_open_prs_for_agent(
                     identifier,
+                    token
                 )
+                total_refreshed += refreshed_checked
+                total_refreshed_updated += refreshed_updated
+
+                # STEP 2: Fetch new PRs created yesterday (12:00 AM to 11:59:59 PM yesterday)
+                print(f"\n📥 Step 2: Fetching new PRs created on {yesterday.strftime('%Y-%m-%d')} (12:00 AM to 11:59:59 PM)...")
+                new_metadata = fetch_daily_prs_metadata(
                     identifier,
                     agent_name,
                     token,
+                    target_date=yesterday
                 )
 
+                if new_metadata:
+                    # Save new metadata to HuggingFace
+                    print(f"💾 Saving {len(new_metadata)} new PRs from {yesterday}...")
+                    save_pr_metadata_to_hf(new_metadata, identifier)
+                    total_new_prs += len(new_metadata)
+                else:
+                    print(f"   No new PRs found created on {yesterday}")
 
+                agents_processed += 1
 
+            except Exception as e:
+                print(f"✗ Error updating {identifier}: {str(e)}")
+                import traceback
+                traceback.print_exc()
+                continue
 
+        print(f"\n{'='*80}")
+        print(f"📊 Mining Summary:")
+        print(f"   Total agents processed: {agents_processed}")
+        print(f"   Open PRs refreshed: {total_refreshed} checked, {total_refreshed_updated} updated")
+        print(f"   New PRs added (from yesterday): {total_new_prs}")
+        print(f"{'='*80}")
 
+        print(f"\n✅ Daily Incremental PR Mining completed at {datetime.now(timezone.utc).isoformat()}")
 
+    except Exception as e:
+        print(f"✗ Daily mining failed: {str(e)}")
+        import traceback
+        traceback.print_exc()
 
 
 def construct_leaderboard_from_metadata():
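
refresh_open_prs_for_agent is called in Step 1 above but not shown in this diff. As a rough sketch only, assuming each stored record keeps an html_url plus state/merged_at/closed_at fields, re-checking one open PR against the GitHub REST API might look like this:

# Illustrative sketch only, not part of app.py. One way a refresh step could
# re-check stored open PRs; the record fields and return shape are assumptions.
import requests

def refresh_open_pr(record, token=None):
    """Return an updated copy of a stored PR record and whether anything changed upstream."""
    headers = {'Authorization': f'token {token}'} if token else {}
    # e.g. "https://github.com/owner/repo/pull/123" -> owner, repo, 123
    owner, repo, _, number = record['html_url'].split('/')[-4:]
    resp = requests.get(f"https://api.github.com/repos/{owner}/{repo}/pulls/{number}", headers=headers)
    resp.raise_for_status()
    pr = resp.json()
    updated = dict(record, state=pr['state'], merged_at=pr['merged_at'], closed_at=pr['closed_at'])
    return updated, updated != record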
 
@@ -1568,58 +1469,6 @@ def construct_leaderboard_from_metadata():
     return cache_dict
 
 
 # =============================================================================
 # UI FUNCTIONS
 # =============================================================================
@@ -1792,7 +1641,8 @@ def get_leaderboard_dataframe():
 def submit_agent(identifier, agent_name, organization, description, website):
     """
     Submit a new agent to the leaderboard.
+    Validates input and saves submission.
+    PR data will be populated by the daily incremental update.
     """
     # Validate required fields
     if not identifier or not identifier.strip():
@@ -1836,64 +1686,8 @@ def submit_agent(identifier, agent_name, organization, description, website):
     if not save_agent_to_hf(submission):
         return "❌ Failed to save submission", get_leaderboard_dataframe(), create_monthly_metrics_plot()
 
+    success_msg = f"✅ Successfully submitted {agent_name}!\n\nPR data will be populated by the daily incremental update (runs at 12:00 AM UTC)."
+    return success_msg, get_leaderboard_dataframe(), create_monthly_metrics_plot()
 
 
 # =============================================================================
@@ -1922,19 +1716,17 @@ else:
     print("   (Explicitly set via '--no-debug' flag)")
     print()
 
+    # Start APScheduler for daily incremental PR mining at 12:00 AM UTC
    scheduler = BackgroundScheduler(timezone="UTC")
    scheduler.add_job(
+        update_all_agents_incremental,
        trigger=CronTrigger(hour=0, minute=0),  # 12:00 AM UTC daily
+        id='daily_incremental_pr_mining',
+        name='Daily Incremental PR Mining',
        replace_existing=True
    )
    scheduler.start()
+    print("✓ Scheduler started: Daily Incremental PR Mining at 12:00 AM UTC")
 
    # Create Gradio interface
    with gr.Blocks(title="SWE Agent PR Leaderboard", theme=gr.themes.Soft()) as app:
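
The scheduler block above now registers update_all_agents_incremental directly with APScheduler. For reference, a minimal standalone equivalent of that cron wiring, with a stand-in job function, is:

# Illustrative sketch only, not part of app.py. Minimal APScheduler wiring
# equivalent to the block above, with a stand-in job function.
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger

def job():
    print("running daily incremental PR mining...")

scheduler = BackgroundScheduler(timezone="UTC")
scheduler.add_job(
    job,
    trigger=CronTrigger(hour=0, minute=0),  # every day at 00:00 UTC
    id='daily_incremental_pr_mining',
    name='Daily Incremental PR Mining',
    replace_existing=True,
)
scheduler.start()  # runs jobs on a background thread until shutdown() is called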