zhiminy committed
Commit 35588b6 · 1 Parent(s): ea8e7bd
Files changed (2)
  1. app.py +221 -80
  2. msr.py +194 -34
app.py CHANGED

@@ -347,7 +347,7 @@ def extract_pr_metadata(pr):
     }
 
 
-def fetch_all_prs_metadata(identifier, agent_name, token=None, start_from_date=None, year=None):
+def fetch_all_prs_metadata(identifier, agent_name, token=None, start_from_date=None, year=None, exclude_dates=None):
     """
     Fetch pull requests associated with a GitHub user/bot for the past 6 months.
     Returns lightweight metadata instead of full PR objects.

@@ -364,6 +364,7 @@ fetch_all_prs_metadata(identifier, agent_name, token=None, start_from_date=N
         token: GitHub API token
         start_from_date: Only fetch PRs created after this date (for incremental updates)
         year: Year parameter (deprecated, kept for compatibility but not used)
+        exclude_dates: Set of date objects to exclude from mining (already-mined dates)
 
     Returns:
         List of minimal PR metadata dictionaries
428
 
429
  # Convert to lightweight metadata
430
  all_prs = list(prs_by_id.values())
431
+
432
+ # Filter out PRs from excluded dates if specified
433
+ if exclude_dates:
434
+ filtered_prs = []
435
+ excluded_count = 0
436
+ for pr in all_prs:
437
+ created_at = pr.get('created_at')
438
+ if created_at:
439
+ try:
440
+ dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
441
+ pr_date = dt.date()
442
+ if pr_date not in exclude_dates:
443
+ filtered_prs.append(pr)
444
+ else:
445
+ excluded_count += 1
446
+ except Exception:
447
+ filtered_prs.append(pr) # Keep PRs with unparseable dates
448
+ else:
449
+ filtered_prs.append(pr) # Keep PRs without created_at
450
+
451
+ if excluded_count > 0:
452
+ print(f" ⏭️ Skipped {excluded_count} PRs from already-mined dates")
453
+ all_prs = filtered_prs
454
+
455
  if DEBUG_MODE:
456
  print(f"\n✅ COMPLETE (DEBUG MODE): Found {len(all_prs)} unique PRs for {identifier}")
457
  print(f" Note: In production mode, this would fetch ALL PRs")
 
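To make the new exclude_dates behaviour concrete, here is a minimal, self-contained sketch of the same filtering rule applied to two hand-made metadata records (record shapes and dates are illustrative, not taken from the repository):

    from datetime import datetime, date

    exclude_dates = {date(2025, 1, 15)}  # dates whose daily files already exist
    sample_prs = [
        {"url": "https://github.com/org/repo/pull/1", "created_at": "2025-01-15T09:30:00Z"},
        {"url": "https://github.com/org/repo/pull/2", "created_at": "2025-01-16T11:00:00Z"},
    ]

    kept = []
    for pr in sample_prs:
        created = datetime.fromisoformat(pr["created_at"].replace("Z", "+00:00"))
        if created.date() not in exclude_dates:
            kept.append(pr)  # only pull/2 survives; pull/1 falls on an already-mined date
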
@@ -675,19 +700,21 @@ def save_pr_metadata_to_hf(metadata_list, agent_identifier):
             # Save locally
             save_jsonl(local_filename, merged_metadata)
 
-            # Upload to HuggingFace with folder path
-            api.upload_file(
-                path_or_fileobj=local_filename,
-                path_in_repo=filename,
-                repo_id=PR_METADATA_REPO,
-                repo_type="dataset",
-                token=token
-            )
-
-            # Clean up local file
-            os.remove(local_filename)
-
-            print(f" ✓ Saved {len(merged_metadata)} total PRs to {filename}")
+            try:
+                # Upload to HuggingFace with folder path
+                upload_with_retry(
+                    api=api,
+                    path_or_fileobj=local_filename,
+                    path_in_repo=filename,
+                    repo_id=PR_METADATA_REPO,
+                    repo_type="dataset",
+                    token=token
+                )
+                print(f" ✓ Saved {len(merged_metadata)} total PRs to {filename}")
+            finally:
+                # Always clean up local file, even if upload fails
+                if os.path.exists(local_filename):
+                    os.remove(local_filename)
 
         return True
 
 
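The try/finally pattern introduced here guarantees the temporary JSONL file is removed even when every retry fails and upload_with_retry re-raises. A small illustrative sketch of that guarantee (the file name and failing uploader are invented for the example):

    import os

    def failing_upload():
        raise RuntimeError("simulated upload failure")

    local_filename = "example.jsonl"  # hypothetical temp file
    open(local_filename, "w").close()

    try:
        try:
            failing_upload()
        finally:
            if os.path.exists(local_filename):
                os.remove(local_filename)  # runs even though the upload raised
    except RuntimeError:
        pass

    assert not os.path.exists(local_filename)
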
@@ -891,6 +918,61 @@ def get_daily_files_last_n_months(agent_identifier, n_months=6):
         return []
 
 
+def get_already_mined_dates(agent_identifier, n_months=6):
+    """
+    Get set of dates that have already been mined for an agent.
+
+    Args:
+        agent_identifier: GitHub identifier of the agent
+        n_months: Number of months to look back (default: 6)
+
+    Returns:
+        Set of date objects (datetime.date) that already have data files
+    """
+    try:
+        api = HfApi()
+
+        # Calculate date range
+        today = datetime.now(timezone.utc)
+        n_months_ago = today - timedelta(days=30 * n_months)
+
+        # List all files in the repository
+        files = api.list_repo_files(repo_id=PR_METADATA_REPO, repo_type="dataset")
+
+        # Filter for files in this agent's folder
+        agent_pattern = f"{agent_identifier}/"
+        agent_files = [f for f in files if f.startswith(agent_pattern) and f.endswith('.jsonl')]
+
+        mined_dates = set()
+        for filename in agent_files:
+            try:
+                # Extract date from filename: [agent_identifier]/YYYY.MM.DD.jsonl
+                parts = filename.split('/')
+                if len(parts) != 2:
+                    continue
+
+                date_part = parts[1].replace('.jsonl', '')  # Get YYYY.MM.DD
+                date_components = date_part.split('.')
+                if len(date_components) != 3:
+                    continue
+
+                file_year, file_month, file_day = map(int, date_components)
+                file_date = datetime(file_year, file_month, file_day, tzinfo=timezone.utc).date()
+
+                # Only include dates within the last n_months
+                if n_months_ago.date() <= file_date <= today.date():
+                    mined_dates.add(file_date)
+            except Exception as e:
+                print(f" Warning: Could not parse date from filename {filename}: {e}")
+                continue
+
+        return mined_dates
+
+    except Exception as e:
+        print(f" Warning: Could not get already-mined dates for {agent_identifier}: {str(e)}")
+        return set()
+
+
 def fetch_pr_current_status(pr_url, token):
     """
     Fetch the current status of a single PR from GitHub API.
 
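A quick sketch of the filename convention this helper relies on: each daily file is stored as <agent_identifier>/YYYY.MM.DD.jsonl, so the mined date can be recovered from the path alone (the identifier and date below are invented for illustration):

    from datetime import datetime, timezone

    path = "example-bot/2025.03.07.jsonl"  # hypothetical file in the metadata repo
    agent, name = path.split("/")
    year, month, day = map(int, name.removesuffix(".jsonl").split("."))
    mined = datetime(year, month, day, tzinfo=timezone.utc).date()
    # mined == date(2025, 3, 7); collecting these per agent yields the exclude_dates set
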
@@ -1024,19 +1106,22 @@ def refresh_open_prs_for_agent(agent_identifier, token):
                 # Save locally
                 save_jsonl(local_filename, updated_prs)
 
-                # Upload back to HuggingFace
-                api = HfApi()
-                api.upload_file(
-                    path_or_fileobj=local_filename,
-                    path_in_repo=filename,
-                    repo_id=PR_METADATA_REPO,
-                    repo_type="dataset",
-                    token=get_hf_token()
-                )
-
-                # Clean up local file
-                os.remove(local_filename)
-                print(f" 💾 Updated {filename}")
+                try:
+                    # Upload back to HuggingFace
+                    api = HfApi()
+                    upload_with_retry(
+                        api=api,
+                        path_or_fileobj=local_filename,
+                        path_in_repo=filename,
+                        repo_id=PR_METADATA_REPO,
+                        repo_type="dataset",
+                        token=get_hf_token()
+                    )
+                    print(f" 💾 Updated {filename}")
+                finally:
+                    # Always clean up local file, even if upload fails
+                    if os.path.exists(local_filename):
+                        os.remove(local_filename)
 
             except Exception as e:
                 print(f" Warning: Could not process {filename}: {str(e)}")

@@ -1131,6 +1216,49 @@ def get_hf_token():
     return token
 
 
+def upload_with_retry(api, path_or_fileobj, path_in_repo, repo_id, repo_type, token, max_retries=5):
+    """
+    Upload file to HuggingFace with exponential backoff retry logic.
+
+    Args:
+        api: HfApi instance
+        path_or_fileobj: Local file path to upload
+        path_in_repo: Target path in the repository
+        repo_id: Repository ID
+        repo_type: Type of repository (e.g., "dataset")
+        token: HuggingFace token
+        max_retries: Maximum number of retry attempts
+
+    Returns:
+        True if upload succeeded, raises exception if all retries failed
+    """
+    delay = 2.0  # Initial delay in seconds
+
+    for attempt in range(max_retries):
+        try:
+            api.upload_file(
+                path_or_fileobj=path_or_fileobj,
+                path_in_repo=path_in_repo,
+                repo_id=repo_id,
+                repo_type=repo_type,
+                token=token
+            )
+            if attempt > 0:
+                print(f" ✓ Upload succeeded on attempt {attempt + 1}/{max_retries}")
+            return True
+
+        except Exception as e:
+            if attempt < max_retries - 1:
+                wait_time = delay + random.uniform(0, 1.0)
+                print(f" ⚠️ Upload failed (attempt {attempt + 1}/{max_retries}): {str(e)}")
+                print(f" ⏳ Retrying in {wait_time:.1f} seconds...")
+                time.sleep(wait_time)
+                delay = min(delay * 2, 60.0)  # Exponential backoff, max 60s
+            else:
+                print(f" ✗ Upload failed after {max_retries} attempts: {str(e)}")
+                raise
+
+
 def save_agent_to_hf(data):
     """Save a new agent to HuggingFace dataset as {identifier}.json in root."""
     try:
 
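A hedged usage sketch of the new retry wrapper (the repository id, file names, and token below are placeholders): the call site mirrors api.upload_file, with waits of roughly 2, 4, 8, 16 seconds plus up to 1 s of jitter between the five attempts, after which the last failure is re-raised.

    from huggingface_hub import HfApi

    api = HfApi()
    upload_with_retry(
        api=api,
        path_or_fileobj="leaderboard.csv",   # hypothetical local file
        path_in_repo="2025.csv",             # hypothetical target path
        repo_id="your-org/your-dataset",     # placeholder repository
        repo_type="dataset",
        token="hf_xxx",                      # placeholder token
        max_retries=5,
    )
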
@@ -1147,20 +1275,22 @@ def save_agent_to_hf(data):
         with open(filename, 'w') as f:
             json.dump(data, f, indent=2)
 
-        # Upload to HuggingFace (root directory)
-        api.upload_file(
-            path_or_fileobj=filename,
-            path_in_repo=filename,
-            repo_id=AGENTS_REPO,
-            repo_type="dataset",
-            token=token
-        )
-
-        # Clean up local file
-        os.remove(filename)
-
-        print(f"✓ Saved agent to HuggingFace: {filename}")
-        return True
+        try:
+            # Upload to HuggingFace (root directory)
+            upload_with_retry(
+                api=api,
+                path_or_fileobj=filename,
+                path_in_repo=filename,
+                repo_id=AGENTS_REPO,
+                repo_type="dataset",
+                token=token
+            )
+            print(f"✓ Saved agent to HuggingFace: {filename}")
+            return True
+        finally:
+            # Always clean up local file, even if upload fails
+            if os.path.exists(filename):
+                os.remove(filename)
 
     except Exception as e:
         print(f"✗ Error saving agent: {str(e)}")

@@ -1192,21 +1322,23 @@ def save_leaderboard_to_hf(cache_dict):
         filename = f"{year}.csv"
         df.to_csv(filename, index=False)
 
-        # Upload to HuggingFace
-        api = HfApi()
-        api.upload_file(
-            path_or_fileobj=filename,
-            path_in_repo=filename,
-            repo_id=LEADERBOARD_REPO,
-            repo_type="dataset",
-            token=token
-        )
-
-        # Clean up local file
-        os.remove(filename)
-
-        print(f"✓ Saved leaderboard to HuggingFace as {filename} ({len(data_list)} entries)")
-        return True
+        try:
+            # Upload to HuggingFace
+            api = HfApi()
+            upload_with_retry(
+                api=api,
+                path_or_fileobj=filename,
+                path_in_repo=filename,
+                repo_id=LEADERBOARD_REPO,
+                repo_type="dataset",
+                token=token
+            )
+            print(f"✓ Saved leaderboard to HuggingFace as {filename} ({len(data_list)} entries)")
+            return True
+        finally:
+            # Always clean up local file, even if upload fails
+            if os.path.exists(filename):
+                os.remove(filename)
 
     except Exception as e:
         print(f"✗ Error saving leaderboard: {str(e)}")

@@ -1222,10 +1354,12 @@ def update_all_agents_incremental():
     Memory-efficient incremental update of PR statistics for all agents.
 
     Strategy:
-    1. For each agent, check latest PR date from stored metadata
-    2. Only fetch NEW PRs created after that date
-    3. Store minimal metadata (not full PR objects) to avoid storage limits
-    4. Construct leaderboard from stored metadata
+    1. For each agent, load existing data from SWE-Arena/pr_metadata
+    2. Identify already-mined dates (based on filename: YYYY.MM.DD.jsonl)
+    3. Only fetch PRs from dates that haven't been mined yet (within last 6 months)
+    4. If no data exists at all, mine everything from scratch
+    5. Store minimal metadata (not full PR objects) to avoid storage limits
+    6. Construct leaderboard from ALL stored metadata (last 6 months)
 
     Returns dictionary of all agent data with current stats.
     """
 
@@ -1254,32 +1388,39 @@ def update_all_agents_incremental():
         print(f"Processing: {agent_name} ({identifier})")
         print(f"{'='*80}")
 
-        # Check for existing metadata to determine incremental update date
-        latest_pr_date = get_latest_pr_date_for_agent(identifier)
-
-        if latest_pr_date:
-            print(f"📅 Latest PR found: {latest_pr_date.strftime('%Y-%m-%d %H:%M:%S')}")
-            print(f" Fetching only PRs created after this date...")
-            start_from = latest_pr_date + timedelta(seconds=1)  # Start 1 second after
+        # Get already-mined dates for this agent (last 6 months)
+        already_mined_dates = get_already_mined_dates(identifier, n_months=6)
+
+        if already_mined_dates:
+            print(f"📅 Found {len(already_mined_dates)} already-mined dates")
+            print(f" Skipping these dates and fetching only new data...")
+            # Fetch only PRs from dates not yet mined
+            new_metadata = fetch_all_prs_metadata(
+                identifier,
+                agent_name,
+                token,
+                start_from_date=None,  # Use full 6-month range
+                exclude_dates=already_mined_dates  # But exclude already-mined dates
+            )
         else:
-            print(f"📅 No existing PRs found. Fetching all PR metadata...")
-            start_from = None
-
-        # Fetch PR metadata (lightweight, memory-efficient)
-        new_metadata = fetch_all_prs_metadata(
-            identifier,
-            agent_name,
-            token,
-            start_from_date=start_from
-        )
+            print(f"📅 No existing data found. Mining everything from scratch...")
+            # Mine everything from scratch (full 6-month range)
+            new_metadata = fetch_all_prs_metadata(
+                identifier,
+                agent_name,
+                token,
+                start_from_date=None
+            )
 
         if new_metadata:
-            # Save new metadata to HuggingFace (organized by agent_identifier/year.month)
+            # Save new metadata to HuggingFace (organized by agent_identifier/YYYY.MM.DD.jsonl)
             print(f"💾 Saving {len(new_metadata)} new PR records...")
            save_pr_metadata_to_hf(new_metadata, identifier)
+        else:
+            print(f" No new PRs to save")
 
-        # Load all metadata for current year to calculate stats
-        print(f"📊 Calculating statistics from stored metadata...")
+        # Load ALL metadata for current year to calculate stats (aggregates entire last 6 months)
+        print(f"📊 Calculating statistics from ALL stored metadata (last 6 months)...")
         all_year_metadata = load_pr_metadata_for_year(current_year)
 
         # Filter for this specific agent
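
Taken together, the per-agent update now reads roughly as below; this is a condensed sketch of the flow the diff implements, with a placeholder agent identifier and error handling omitted:

    already_mined = get_already_mined_dates("example-bot", n_months=6)  # dates with existing daily files
    if already_mined:
        new_metadata = fetch_all_prs_metadata(
            "example-bot", "Example Bot", token,
            start_from_date=None,            # always scan the full 6-month window
            exclude_dates=already_mined,     # but drop PRs from days already on disk
        )
    else:
        new_metadata = fetch_all_prs_metadata("example-bot", "Example Bot", token, start_from_date=None)
    if new_metadata:
        save_pr_metadata_to_hf(new_metadata, "example-bot")
    # stats are then recomputed from everything stored for the year, not just the new batch
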
msr.py CHANGED

@@ -126,6 +126,49 @@ def get_hf_token():
     return token
 
 
+def upload_with_retry(api, path_or_fileobj, path_in_repo, repo_id, repo_type, token, max_retries=5):
+    """
+    Upload file to HuggingFace with exponential backoff retry logic.
+
+    Args:
+        api: HfApi instance
+        path_or_fileobj: Local file path to upload
+        path_in_repo: Target path in the repository
+        repo_id: Repository ID
+        repo_type: Type of repository (e.g., "dataset")
+        token: HuggingFace token
+        max_retries: Maximum number of retry attempts
+
+    Returns:
+        True if upload succeeded, raises exception if all retries failed
+    """
+    delay = 2.0  # Initial delay in seconds
+
+    for attempt in range(max_retries):
+        try:
+            api.upload_file(
+                path_or_fileobj=path_or_fileobj,
+                path_in_repo=path_in_repo,
+                repo_id=repo_id,
+                repo_type=repo_type,
+                token=token
+            )
+            if attempt > 0:
+                print(f" ✓ Upload succeeded on attempt {attempt + 1}/{max_retries}")
+            return True
+
+        except Exception as e:
+            if attempt < max_retries - 1:
+                wait_time = delay + random.uniform(0, 1.0)
+                print(f" ⚠️ Upload failed (attempt {attempt + 1}/{max_retries}): {str(e)}")
+                print(f" ⏳ Retrying in {wait_time:.1f} seconds...")
+                time.sleep(wait_time)
+                delay = min(delay * 2, 60.0)  # Exponential backoff, max 60s
+            else:
+                print(f" ✗ Upload failed after {max_retries} attempts: {str(e)}")
+                raise
+
+
 # =============================================================================
 # GitHub API with backoff (same as app.py)
 # =============================================================================
 
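For reference, the retry schedule implied by these constants: the delay starts at 2 s and doubles after each failure (capped at 60 s), with up to 1 s of uniform jitter added per wait. A small sketch of the base waits across max_retries=5 attempts (the jitter term is random, so these are lower bounds):

    delays = []
    delay = 2.0
    for attempt in range(5 - 1):      # no sleep after the final attempt; it re-raises
        delays.append(delay)          # actual wait is delay + random.uniform(0, 1.0)
        delay = min(delay * 2, 60.0)
    # delays == [2.0, 4.0, 8.0, 16.0]  -> roughly 30-34 s of waiting before giving up
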
@@ -258,7 +301,7 @@ def extract_pr_metadata(pr):
     }
 
 
-def fetch_all_prs_metadata(identifier, agent_name, token=None, start_from_date=None, year=None):
+def fetch_all_prs_metadata(identifier, agent_name, token=None, start_from_date=None, year=None, exclude_dates=None):
     headers = {'Authorization': f'token {token}'} if token else {}
     debug_limit_per_pattern = 10 if DEBUG_MODE else None
     if DEBUG_MODE:

@@ -295,6 +338,30 @@ fetch_all_prs_metadata(identifier, agent_name, token=None, start_from_date=N
         print(f" ⏱️ Time taken: {pattern_duration:.1f} seconds")
         time.sleep(0.2 if DEBUG_MODE else 1.0)
     all_prs = list(prs_by_id.values())
+
+    # Filter out PRs from excluded dates if specified
+    if exclude_dates:
+        filtered_prs = []
+        excluded_count = 0
+        for pr in all_prs:
+            created_at = pr.get('created_at')
+            if created_at:
+                try:
+                    dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
+                    pr_date = dt.date()
+                    if pr_date not in exclude_dates:
+                        filtered_prs.append(pr)
+                    else:
+                        excluded_count += 1
+                except Exception:
+                    filtered_prs.append(pr)  # Keep PRs with unparseable dates
+            else:
+                filtered_prs.append(pr)  # Keep PRs without created_at
+
+        if excluded_count > 0:
+            print(f" ⏭️ Skipped {excluded_count} PRs from already-mined dates")
+        all_prs = filtered_prs
+
     if DEBUG_MODE:
         print(f"\n✅ COMPLETE (DEBUG MODE): Found {len(all_prs)} unique PRs for {identifier}")
         print(f" Note: In production mode, this would fetch ALL PRs")

@@ -361,15 +428,20 @@ def save_pr_metadata_to_hf(metadata_list, agent_identifier):
             existing_by_url.update(new_by_url)
             merged_metadata = list(existing_by_url.values())
             save_jsonl(local_filename, merged_metadata)
-            api.upload_file(
-                path_or_fileobj=local_filename,
-                path_in_repo=filename,
-                repo_id=PR_METADATA_REPO,
-                repo_type="dataset",
-                token=token
-            )
-            os.remove(local_filename)
-            print(f" ✓ Saved {len(merged_metadata)} total PRs to {filename}")
+            try:
+                upload_with_retry(
+                    api=api,
+                    path_or_fileobj=local_filename,
+                    path_in_repo=filename,
+                    repo_id=PR_METADATA_REPO,
+                    repo_type="dataset",
+                    token=token
+                )
+                print(f" ✓ Saved {len(merged_metadata)} total PRs to {filename}")
+            finally:
+                # Always clean up the local file, even if upload fails
+                if os.path.exists(local_filename):
+                    os.remove(local_filename)
         return True
     except Exception as e:
         print(f"✗ Error saving PR metadata: {str(e)}")
 
@@ -493,6 +565,61 @@ def get_latest_pr_date_for_agent(agent_identifier):
     return None
 
 
+def get_already_mined_dates(agent_identifier, n_months=6):
+    """
+    Get set of dates that have already been mined for an agent.
+
+    Args:
+        agent_identifier: GitHub identifier of the agent
+        n_months: Number of months to look back (default: 6)
+
+    Returns:
+        Set of date objects (datetime.date) that already have data files
+    """
+    try:
+        api = HfApi()
+
+        # Calculate date range
+        today = datetime.now(timezone.utc)
+        n_months_ago = today - timedelta(days=30 * n_months)
+
+        # List all files in the repository
+        files = api.list_repo_files(repo_id=PR_METADATA_REPO, repo_type="dataset")
+
+        # Filter for files in this agent's folder
+        agent_pattern = f"{agent_identifier}/"
+        agent_files = [f for f in files if f.startswith(agent_pattern) and f.endswith('.jsonl')]
+
+        mined_dates = set()
+        for filename in agent_files:
+            try:
+                # Extract date from filename: [agent_identifier]/YYYY.MM.DD.jsonl
+                parts = filename.split('/')
+                if len(parts) != 2:
+                    continue
+
+                date_part = parts[1].replace('.jsonl', '')  # Get YYYY.MM.DD
+                date_components = date_part.split('.')
+                if len(date_components) != 3:
+                    continue
+
+                file_year, file_month, file_day = map(int, date_components)
+                file_date = datetime(file_year, file_month, file_day, tzinfo=timezone.utc).date()
+
+                # Only include dates within the last n_months
+                if n_months_ago.date() <= file_date <= today.date():
+                    mined_dates.add(file_date)
+            except Exception as e:
+                print(f" Warning: Could not parse date from filename {filename}: {e}")
+                continue
+
+        return mined_dates
+
+    except Exception as e:
+        print(f" Warning: Could not get already-mined dates for {agent_identifier}: {str(e)}")
+        return set()
+
+
 def save_leaderboard_to_hf(cache_dict):
     if DEBUG_MODE:
         global DEBUG_LEADERBOARD_CACHE
 
@@ -510,16 +637,21 @@ def save_leaderboard_to_hf(cache_dict):
         filename = f"{year}.csv"
         df.to_csv(filename, index=False)
         api = HfApi()
-        api.upload_file(
-            path_or_fileobj=filename,
-            path_in_repo=filename,
-            repo_id=LEADERBOARD_REPO,
-            repo_type="dataset",
-            token=token
-        )
-        os.remove(filename)
-        print(f"✓ Saved leaderboard to HuggingFace as {filename} ({len(data_list)} entries)")
-        return True
+        try:
+            upload_with_retry(
+                api=api,
+                path_or_fileobj=filename,
+                path_in_repo=filename,
+                repo_id=LEADERBOARD_REPO,
+                repo_type="dataset",
+                token=token
+            )
+            print(f"✓ Saved leaderboard to HuggingFace as {filename} ({len(data_list)} entries)")
+            return True
+        finally:
+            # Always clean up local file, even if upload fails
+            if os.path.exists(filename):
+                os.remove(filename)
     except Exception as e:
         print(f"✗ Error saving leaderboard: {str(e)}")
         return False

@@ -539,6 +671,19 @@ def calculate_pr_stats_from_metadata(metadata_list):
 
 
 def update_all_agents_incremental():
+    """
+    Memory-efficient incremental update of PR statistics for all agents.
+
+    Strategy:
+    1. For each agent, load existing data from SWE-Arena/pr_metadata
+    2. Identify already-mined dates (based on filename: YYYY.MM.DD.jsonl)
+    3. Only fetch PRs from dates that haven't been mined yet (within last 6 months)
+    4. If no data exists at all, mine everything from scratch
+    5. Store minimal metadata (not full PR objects) to avoid storage limits
+    6. Construct leaderboard from ALL stored metadata (last 6 months)
+
+    Returns dictionary of all agent data with current stats.
+    """
     token = get_github_token()
     current_year = datetime.now().year
     agents = load_agents_from_hf()
 
@@ -556,24 +701,39 @@ def update_all_agents_incremental():
         print(f"\n{'='*80}")
         print(f"Processing: {agent_name} ({identifier})")
         print(f"{'='*80}")
-        latest_pr_date = get_latest_pr_date_for_agent(identifier)
-        if latest_pr_date:
-            print(f"📅 Latest PR found: {latest_pr_date.strftime('%Y-%m-%d %H:%M:%S')}")
-            print(f" Fetching only PRs created after this date...")
-            start_from = latest_pr_date + timedelta(seconds=1)
+
+        # Get already-mined dates for this agent (last 6 months)
+        already_mined_dates = get_already_mined_dates(identifier, n_months=6)
+
+        if already_mined_dates:
+            print(f"📅 Found {len(already_mined_dates)} already-mined dates")
+            print(f" Skipping these dates and fetching only new data...")
+            # Fetch only PRs from dates not yet mined
+            new_metadata = fetch_all_prs_metadata(
+                identifier,
+                agent_name,
+                token,
+                start_from_date=None,  # Use full 6-month range
+                exclude_dates=already_mined_dates  # But exclude already-mined dates
+            )
         else:
-            print(f"📅 No existing PRs found. Fetching all PR metadata...")
-            start_from = None
-        new_metadata = fetch_all_prs_metadata(
-            identifier,
-            agent_name,
-            token,
-            start_from_date=start_from
-        )
+            print(f"📅 No existing data found. Mining everything from scratch...")
+            # Mine everything from scratch (full 6-month range)
+            new_metadata = fetch_all_prs_metadata(
+                identifier,
+                agent_name,
+                token,
+                start_from_date=None
+            )
+
         if new_metadata:
             print(f"💾 Saving {len(new_metadata)} new PR records...")
             save_pr_metadata_to_hf(new_metadata, identifier)
-            print(f"📊 Calculating statistics from stored metadata...")
+        else:
+            print(f" No new PRs to save")
+
+        # Load ALL metadata for current year to calculate stats (aggregates entire last 6 months)
+        print(f"📊 Calculating statistics from ALL stored metadata (last 6 months)...")
         all_year_metadata = load_pr_metadata_for_year(current_year)
         agent_metadata = [pr for pr in all_year_metadata if pr.get('agent_identifier') == identifier]
         stats = calculate_pr_stats_from_metadata(agent_metadata)
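
If msr.py is run as a standalone mining script, the incremental update can be driven with a guard like the following; this is an assumed invocation pattern, not something shown in the commit:

    if __name__ == "__main__":
        # Mines only the not-yet-covered dates for every registered agent,
        # then rebuilds the leaderboard from all stored metadata.
        update_all_agents_incremental()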