zhiminy committed
Commit c27bef0 · Parent: 7d38bb7
Files changed (1)
  1. msr.py +693 -0
msr.py ADDED
@@ -0,0 +1,693 @@
"""
Minimalist PR Metadata Mining Script
Mines PR metadata from GitHub and saves to HuggingFace dataset.
"""

import json
import os
import sys
import time
import requests
from datetime import datetime, timezone, timedelta
from collections import defaultdict
from huggingface_hub import HfApi, hf_hub_download
from dotenv import load_dotenv
import random

# Load environment variables
load_dotenv()

# =============================================================================
# CONFIGURATION
# =============================================================================

AGENTS_REPO = "SWE-Arena/swe_agents"
PR_METADATA_REPO = "SWE-Arena/pr_metadata"
LEADERBOARD_TIME_FRAME_DAYS = 180  # 6 months

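# Illustrative only: the script expects GITHUB_TOKEN and HF_TOKEN to be available in
# the environment or in a local .env file picked up by load_dotenv(). A minimal .env
# might look like this (values are placeholders, not real tokens):
#
#   GITHUB_TOKEN=ghp_xxxxxxxxxxxxxxxxxxxx
#   HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxxxxxx
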
# =============================================================================
# UTILITY FUNCTIONS
# =============================================================================

def load_jsonl(filename):
    """Load JSONL file and return list of dictionaries."""
    if not os.path.exists(filename):
        return []

    data = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    print(f"Warning: Skipping invalid JSON line: {e}")
    return data


def save_jsonl(filename, data):
    """Save list of dictionaries to JSONL file."""
    with open(filename, 'w', encoding='utf-8') as f:
        for item in data:
            f.write(json.dumps(item) + '\n')

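# Illustrative only: each line of a metadata JSONL file is one JSON object with the
# fields produced by extract_pr_metadata() below, e.g. (values are made up):
#
#   {"html_url": "https://github.com/octo/repo/pull/1", "created_at": "2025-01-15T12:00:00Z", "merged_at": null, "closed_at": null}
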
def get_github_token():
    """Get GitHub token from environment variables."""
    token = os.getenv('GITHUB_TOKEN')
    if not token:
        print("Warning: GITHUB_TOKEN not found. Unauthenticated rate limit: 60 requests/hour (authenticated: 5000/hour)")
    return token


def get_hf_token():
    """Get HuggingFace token from environment variables."""
    token = os.getenv('HF_TOKEN')
    if not token:
        print("Warning: HF_TOKEN not found in environment variables")
    return token


# =============================================================================
# GITHUB API FUNCTIONS
# =============================================================================

def request_with_backoff(method, url, *, headers=None, params=None, json_body=None, data=None, max_retries=10, timeout=30):
    """
    Perform an HTTP request with exponential backoff and jitter for the GitHub API.
    Retries on 403/429 (rate limits), 5xx server errors, and transient network exceptions.
    """
    delay = 1.0
    for attempt in range(max_retries):
        try:
            resp = requests.request(
                method,
                url,
                headers=headers or {},
                params=params,
                json=json_body,
                data=data,
                timeout=timeout
            )

            status = resp.status_code

            # Success
            if 200 <= status < 300:
                return resp

            # Rate limits or server errors -> retry with backoff
            if status in (403, 429) or 500 <= status < 600:
                wait = None

                # Prefer Retry-After when present
                retry_after = resp.headers.get('Retry-After') or resp.headers.get('retry-after')
                if retry_after:
                    try:
                        wait = float(retry_after)
                    except Exception:
                        wait = None

                # Fallback to X-RateLimit-Reset when 403/429
                if wait is None and status in (403, 429):
                    reset_hdr = resp.headers.get('X-RateLimit-Reset') or resp.headers.get('x-ratelimit-reset')
                    if reset_hdr:
                        try:
                            reset_ts = int(float(reset_hdr))
                            wait = max(reset_ts - time.time() + 2, 1)
                        except Exception:
                            wait = None

                # Final fallback: exponential backoff with jitter
                if wait is None:
                    wait = delay + random.uniform(0, 0.5)

                # Cap individual wait to avoid extreme sleeps
                wait = max(1.0, min(wait, 120.0))
                print(f"GitHub API {status}. Backing off {wait:.1f}s (attempt {attempt + 1}/{max_retries})...")
                time.sleep(wait)
                delay = min(delay * 2, 60.0)
                continue

            # Non-retryable error; return response for caller to handle
            return resp

        except requests.RequestException as e:
            # Network error -> retry with backoff
            wait = delay + random.uniform(0, 0.5)
            wait = max(1.0, min(wait, 60.0))
            print(f"Request error: {e}. Retrying in {wait:.1f}s (attempt {attempt + 1}/{max_retries})...")
            time.sleep(wait)
            delay = min(delay * 2, 60.0)

    print(f"Exceeded max retries for {url}")
    return None

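# Illustrative usage sketch (not executed here): request_with_backoff() is a drop-in
# wrapper around requests.request() that adds retries, e.g.
#
#   resp = request_with_backoff('GET', 'https://api.github.com/rate_limit',
#                               headers={'Authorization': 'token <GITHUB_TOKEN>'})
#   if resp is not None and resp.status_code == 200:
#       print(resp.json()['resources']['search'])
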
def fetch_prs_with_time_partition(base_query, start_date, end_date, headers, prs_by_id, depth=0):
    """
    Fetch PRs within a specific time range using time-based partitioning.
    Recursively splits the time range if hitting the 1000-result limit.
    Supports splitting by day, hour, minute, and second as needed.

    Returns the number of PRs found in this time partition.
    """
    # Calculate time difference
    time_diff = end_date - start_date
    total_seconds = time_diff.total_seconds()

    # Determine granularity and format dates accordingly
    if total_seconds >= 86400:  # >= 1 day
        # Use day granularity (YYYY-MM-DD)
        start_str = start_date.strftime('%Y-%m-%d')
        end_str = end_date.strftime('%Y-%m-%d')
    elif total_seconds >= 3600:  # >= 1 hour but < 1 day
        # Use hour granularity (YYYY-MM-DDTHH:MM:SSZ)
        start_str = start_date.strftime('%Y-%m-%dT%H:00:00Z')
        end_str = end_date.strftime('%Y-%m-%dT%H:59:59Z')
    elif total_seconds >= 60:  # >= 1 minute but < 1 hour
        # Use minute granularity (YYYY-MM-DDTHH:MM:SSZ)
        start_str = start_date.strftime('%Y-%m-%dT%H:%M:00Z')
        end_str = end_date.strftime('%Y-%m-%dT%H:%M:59Z')
    else:  # < 1 minute
        # Use second granularity (YYYY-MM-DDTHH:MM:SSZ)
        start_str = start_date.strftime('%Y-%m-%dT%H:%M:%SZ')
        end_str = end_date.strftime('%Y-%m-%dT%H:%M:%SZ')

    # Add date range to query
    query = f'{base_query} created:{start_str}..{end_str}'

    indent = " " + " " * depth
    print(f"{indent}Searching range {start_str} to {end_str}...")

    page = 1
    per_page = 100
    total_in_partition = 0

    while True:
        url = 'https://api.github.com/search/issues'
        params = {
            'q': query,
            'per_page': per_page,
            'page': page,
            'sort': 'created',
            'order': 'asc'
        }

        try:
            response = request_with_backoff('GET', url, headers=headers, params=params)
            if response is None:
                print(f"{indent} Error: retries exhausted for range {start_str} to {end_str}")
                return total_in_partition

            if response.status_code != 200:
                print(f"{indent} Error: HTTP {response.status_code} for range {start_str} to {end_str}")
                return total_in_partition

            data = response.json()
            total_count = data.get('total_count', 0)
            items = data.get('items', [])

            if not items:
                break

            # Add PRs to global dict
            for pr in items:
                pr_id = pr.get('id')
                if pr_id and pr_id not in prs_by_id:
                    prs_by_id[pr_id] = pr
                    total_in_partition += 1

            # Check if we hit the 1000-result limit
            if total_count > 1000 and page == 10:
                print(f"{indent} ⚠️ Hit 1000-result limit ({total_count} total). Splitting time range...")

                # Determine how to split based on time range duration
                if total_seconds < 2:  # Less than 2 seconds - can't split further
                    print(f"{indent} ⚠️ Cannot split further (range < 2 seconds). Some results may be missing.")
                    break

                elif total_seconds < 120:  # Less than 2 minutes - split by seconds
                    # Split into 2-4 parts depending on range
                    num_splits = min(4, max(2, int(total_seconds / 30)))
                    split_duration = time_diff / num_splits
                    split_dates = [start_date + split_duration * i for i in range(num_splits + 1)]

                    total_from_splits = 0
                    for i in range(num_splits):
                        split_start = split_dates[i]
                        split_end = split_dates[i + 1]
                        # Avoid overlapping ranges (add 1 second to start)
                        if i > 0:
                            split_start = split_start + timedelta(seconds=1)

                        count = fetch_prs_with_time_partition(
                            base_query, split_start, split_end, headers, prs_by_id, depth + 1
                        )
                        total_from_splits += count

                    return total_from_splits

                elif total_seconds < 7200:  # Less than 2 hours - split by minutes
                    # Split into 2-4 parts
                    num_splits = min(4, max(2, int(total_seconds / 1800)))
                    split_duration = time_diff / num_splits
                    split_dates = [start_date + split_duration * i for i in range(num_splits + 1)]

                    total_from_splits = 0
                    for i in range(num_splits):
                        split_start = split_dates[i]
                        split_end = split_dates[i + 1]
                        # Avoid overlapping ranges (add 1 minute to start)
                        if i > 0:
                            split_start = split_start + timedelta(minutes=1)

                        count = fetch_prs_with_time_partition(
                            base_query, split_start, split_end, headers, prs_by_id, depth + 1
                        )
                        total_from_splits += count

                    return total_from_splits

                elif total_seconds < 172800:  # Less than 2 days - split by hours
                    # Split into 2-4 parts
                    num_splits = min(4, max(2, int(total_seconds / 43200)))
                    split_duration = time_diff / num_splits
                    split_dates = [start_date + split_duration * i for i in range(num_splits + 1)]

                    total_from_splits = 0
                    for i in range(num_splits):
                        split_start = split_dates[i]
                        split_end = split_dates[i + 1]
                        # Avoid overlapping ranges (add 1 hour to start)
                        if i > 0:
                            split_start = split_start + timedelta(hours=1)

                        count = fetch_prs_with_time_partition(
                            base_query, split_start, split_end, headers, prs_by_id, depth + 1
                        )
                        total_from_splits += count

                    return total_from_splits

                else:  # 2+ days - split by days
                    days_diff = time_diff.days

                    # Use aggressive splitting for large ranges or deep recursion
                    # Split into 4 parts if range is > 30 days, otherwise split in half
                    if days_diff > 30 or depth > 5:
                        # Split into 4 parts for more aggressive partitioning
                        quarter_diff = time_diff / 4
                        split_dates = [
                            start_date,
                            start_date + quarter_diff,
                            start_date + quarter_diff * 2,
                            start_date + quarter_diff * 3,
                            end_date
                        ]

                        total_from_splits = 0
                        for i in range(4):
                            split_start = split_dates[i]
                            split_end = split_dates[i + 1]
                            # Avoid overlapping ranges
                            if i > 0:
                                split_start = split_start + timedelta(days=1)

                            count = fetch_prs_with_time_partition(
                                base_query, split_start, split_end, headers, prs_by_id, depth + 1
                            )
                            total_from_splits += count

                        return total_from_splits
                    else:
                        # Binary split for smaller ranges
                        mid_date = start_date + time_diff / 2

                        # Recursively fetch both halves
                        count1 = fetch_prs_with_time_partition(
                            base_query, start_date, mid_date, headers, prs_by_id, depth + 1
                        )
                        count2 = fetch_prs_with_time_partition(
                            base_query, mid_date + timedelta(days=1), end_date, headers, prs_by_id, depth + 1
                        )

                        return count1 + count2

            # Normal pagination: check if there are more pages
            if len(items) < per_page or page >= 10:
                break

            page += 1
            time.sleep(0.5)  # Courtesy delay between pages

        except Exception as e:
            print(f"{indent} Error fetching range {start_str} to {end_str}: {str(e)}")
            return total_in_partition

    if total_in_partition > 0:
        print(f"{indent} ✓ Found {total_in_partition} PRs in range {start_str} to {end_str}")

    return total_in_partition

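# Illustrative only: for a 180-day window whose query exceeds 1000 search results, the
# recursion sketched above might unfold like this (dates are made up):
#
#   fetch_prs_with_time_partition(q, 2024-07-01, 2024-12-28)          # >30 days -> 4-way split
#     fetch_prs_with_time_partition(q, 2024-07-01, 2024-08-14, depth=1)
#     fetch_prs_with_time_partition(q, 2024-08-15, 2024-09-28, depth=1)
#     ...
#
# Each leaf partition stays under the Search API's 1000-result cap and is paginated
# normally (up to 10 pages of 100 items).
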
def extract_pr_metadata(pr):
    """
    Extract minimal PR metadata for efficient storage.
    Only keeps essential fields: html_url, created_at, merged_at, closed_at.
    """
    pull_request = pr.get('pull_request', {})

    # Extract dates
    created_at = pr.get('created_at')
    merged_at = pull_request.get('merged_at')
    closed_at = pr.get('closed_at')

    # Only store closed_at if PR is closed but not merged
    if merged_at:
        closed_at = None  # Don't store redundant info

    return {
        'html_url': pr.get('html_url'),
        'created_at': created_at,
        'merged_at': merged_at,
        'closed_at': closed_at
    }

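# Illustrative only: given a search-API item such as (values are made up)
#   {"id": 1, "html_url": "https://github.com/octo/repo/pull/7",
#    "created_at": "2025-01-10T08:00:00Z", "closed_at": "2025-01-11T09:00:00Z",
#    "pull_request": {"merged_at": "2025-01-11T09:00:00Z"}}
# extract_pr_metadata() returns
#   {"html_url": "https://github.com/octo/repo/pull/7",
#    "created_at": "2025-01-10T08:00:00Z",
#    "merged_at": "2025-01-11T09:00:00Z", "closed_at": None}
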
def fetch_all_prs_metadata(identifier, agent_name, token=None):
    """
    Fetch pull requests associated with a GitHub user or bot for the past LEADERBOARD_TIME_FRAME_DAYS.
    Returns lightweight metadata instead of full PR objects.

    This function employs time-based partitioning to navigate GitHub's 1000-result limit per query.
    It searches using multiple query patterns:
    - is:pr author:{identifier} (PRs authored by the bot)
    - is:pr "co-authored-by: {identifier}" (PRs with commits co-authored by the bot)
    - is:pr head:{identifier}/ (PRs with branch names starting with the bot identifier)

    Args:
        identifier: GitHub username or bot identifier
        agent_name: Human-readable name of the agent for metadata purposes
        token: GitHub API token for authentication

    Returns:
        List of dictionaries containing minimal PR metadata
    """
    headers = {'Authorization': f'token {token}'} if token else {}

    # Define query patterns per rules:
    # 1) author pattern only if identifier contains "[bot]"
    # 2) co-author and head patterns use identifier with "[bot]" removed
    stripped_id = identifier.replace('[bot]', '')
    query_patterns = []
    if '[bot]' in identifier:
        query_patterns.append(f'is:pr author:{identifier}')
    if stripped_id:
        query_patterns.append(f'is:pr "co-authored-by: {stripped_id}"')
        query_patterns.append(f'is:pr head:{stripped_id}/')

    # Use a dict to deduplicate PRs by ID
    prs_by_id = {}

    # Define time range: past LEADERBOARD_TIME_FRAME_DAYS
    current_time = datetime.now(timezone.utc)
    start_date = current_time - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
    end_date = current_time

    for query_pattern in query_patterns:
        print(f"\n🔍 Searching with query: {query_pattern}")
        print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")

        pattern_start_time = time.time()
        initial_count = len(prs_by_id)

        # Fetch with time partitioning
        prs_found = fetch_prs_with_time_partition(
            query_pattern,
            start_date,
            end_date,
            headers,
            prs_by_id
        )

        pattern_duration = time.time() - pattern_start_time
        new_prs = len(prs_by_id) - initial_count

        print(f" ✓ Pattern complete: {new_prs} new unique PRs added ({prs_found} fetched by this pattern)")
        print(f" ⏱️ Time taken: {pattern_duration:.1f} seconds")

        time.sleep(1.0)

    # Convert to lightweight metadata
    all_prs = list(prs_by_id.values())

    print(f"\n✅ COMPLETE: Found {len(all_prs)} unique PRs for {identifier}")
    print(f"📦 Extracting minimal metadata...")

    metadata_list = [extract_pr_metadata(pr) for pr in all_prs]

    # Estimate memory savings
    original_size = sys.getsizeof(str(all_prs))
    metadata_size = sys.getsizeof(str(metadata_list))
    savings_pct = ((original_size - metadata_size) / original_size * 100) if original_size > 0 else 0

    print(f"💾 Memory efficiency: {original_size // 1024}KB → {metadata_size // 1024}KB (saved {savings_pct:.1f}%)")

    return metadata_list

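# Illustrative only: for a hypothetical identifier "example-agent[bot]" the query
# patterns built above would be
#   is:pr author:example-agent[bot]
#   is:pr "co-authored-by: example-agent"
#   is:pr head:example-agent/
# while a plain identifier "example-agent" would produce only the last two.
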
# =============================================================================
# HUGGINGFACE STORAGE FUNCTIONS
# =============================================================================

def group_metadata_by_date(metadata_list):
    """
    Group PR metadata by exact date (year.month.day) for efficient daily storage.
    Returns dict: {(year, month, day): [metadata_list]}
    """
    grouped = defaultdict(list)

    for pr_meta in metadata_list:
        created_at = pr_meta.get('created_at')
        if not created_at:
            continue

        try:
            dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
            key = (dt.year, dt.month, dt.day)
            grouped[key].append(pr_meta)
        except Exception as e:
            print(f"Warning: Could not parse date '{created_at}': {e}")

    return dict(grouped)

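# Illustrative only: two records created on the same (made-up) day land under one key:
#   group_metadata_by_date([
#       {"html_url": "u1", "created_at": "2025-01-15T08:00:00Z"},
#       {"html_url": "u2", "created_at": "2025-01-15T23:59:00Z"},
#   ])
#   -> {(2025, 1, 15): [<both records>]}
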
def upload_with_retry(api, path_or_fileobj, path_in_repo, repo_id, repo_type, token, max_retries=5):
    """
    Upload file to HuggingFace with exponential backoff retry logic.
    """
    delay = 2.0

    for attempt in range(max_retries):
        try:
            api.upload_file(
                path_or_fileobj=path_or_fileobj,
                path_in_repo=path_in_repo,
                repo_id=repo_id,
                repo_type=repo_type,
                token=token
            )
            if attempt > 0:
                print(f" ✓ Upload succeeded on attempt {attempt + 1}/{max_retries}")
            return True

        except Exception as e:
            if attempt < max_retries - 1:
                wait_time = delay + random.uniform(0, 1.0)
                print(f" ⚠️ Upload failed (attempt {attempt + 1}/{max_retries}): {str(e)}")
                print(f" ⏳ Retrying in {wait_time:.1f} seconds...")
                time.sleep(wait_time)
                delay = min(delay * 2, 60.0)
            else:
                print(f" ✗ Upload failed after {max_retries} attempts: {str(e)}")
                raise

def save_pr_metadata_to_hf(metadata_list, agent_identifier):
    """
    Save PR metadata to HuggingFace dataset, organized by [agent_identifier]/YYYY.MM.DD.jsonl.
    Each file is stored in the agent's folder and named YYYY.MM.DD.jsonl for that day's PRs.

    This function APPENDS new metadata and DEDUPLICATES by html_url.

    Args:
        metadata_list: List of PR metadata dictionaries
        agent_identifier: GitHub identifier of the agent (used as folder name)
    """
    try:
        token = get_hf_token()
        if not token:
            raise Exception("No HuggingFace token found")

        api = HfApi()

        # Group by exact date (year, month, day)
        grouped = group_metadata_by_date(metadata_list)

        for (pr_year, month, day), day_metadata in grouped.items():
            # New structure: [agent_identifier]/YYYY.MM.DD.jsonl
            filename = f"{agent_identifier}/{pr_year}.{month:02d}.{day:02d}.jsonl"
            local_filename = f"{pr_year}.{month:02d}.{day:02d}.jsonl"
            print(f"📤 Uploading {len(day_metadata)} PRs to {filename}...")

            # Download existing file if it exists
            existing_metadata = []
            try:
                file_path = hf_hub_download(
                    repo_id=PR_METADATA_REPO,
                    filename=filename,
                    repo_type="dataset",
                    token=token
                )
                existing_metadata = load_jsonl(file_path)
                print(f" Found {len(existing_metadata)} existing PRs in {filename}")
            except Exception:
                print(f" No existing file found for {filename}, creating new")

            # Merge and deduplicate by html_url
            existing_by_url = {meta['html_url']: meta for meta in existing_metadata if meta.get('html_url')}
            new_by_url = {meta['html_url']: meta for meta in day_metadata if meta.get('html_url')}

            # Update with new data (new data overwrites old)
            existing_by_url.update(new_by_url)
            merged_metadata = list(existing_by_url.values())

            # Save locally
            save_jsonl(local_filename, merged_metadata)

            try:
                # Upload to HuggingFace with folder path
                upload_with_retry(
                    api=api,
                    path_or_fileobj=local_filename,
                    path_in_repo=filename,
                    repo_id=PR_METADATA_REPO,
                    repo_type="dataset",
                    token=token
                )
                print(f" ✓ Saved {len(merged_metadata)} total PRs to {filename}")
            finally:
                # Always clean up local file, even if upload fails
                if os.path.exists(local_filename):
                    os.remove(local_filename)

        return True

    except Exception as e:
        print(f"✗ Error saving PR metadata: {str(e)}")
        return False

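# Illustrative only: after a run, the PR_METADATA_REPO dataset is laid out as
#   <agent_identifier>/<YYYY>.<MM>.<DD>.jsonl
# e.g. a hypothetical path example-agent[bot]/2025.01.15.jsonl, where each line is one
# deduplicated PR metadata record for PRs created on that date.
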
def load_agents_from_hf():
    """Load all agent metadata JSON files from HuggingFace dataset."""
    try:
        api = HfApi()
        agents = []

        # List all files in the repository
        files = api.list_repo_files(repo_id=AGENTS_REPO, repo_type="dataset")

        # Filter for JSON files only
        json_files = [f for f in files if f.endswith('.json')]

        print(f"Found {len(json_files)} agent files in {AGENTS_REPO}")

        # Download and parse each JSON file
        for json_file in json_files:
            try:
                file_path = hf_hub_download(
                    repo_id=AGENTS_REPO,
                    filename=json_file,
                    repo_type="dataset"
                )

                with open(file_path, 'r') as f:
                    agent_data = json.load(f)
                    agents.append(agent_data)

            except Exception as e:
                print(f"Warning: Could not load {json_file}: {str(e)}")
                continue

        print(f"✓ Loaded {len(agents)} agents from HuggingFace")
        return agents

    except Exception as e:
        print(f"Could not load agents from HuggingFace: {str(e)}")
        return []

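# Illustrative only: each agent JSON file in AGENTS_REPO is expected to contain at
# least the two fields this script reads (other fields may be present), e.g.
#   {"agent_name": "Example Agent", "github_identifier": "example-agent[bot]"}
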
# =============================================================================
# MAIN MINING FUNCTION
# =============================================================================

def mine_all_agents():
    """
    Mine PR metadata for all agents within LEADERBOARD_TIME_FRAME_DAYS and save to HuggingFace.
    """
    token = get_github_token()

    # Load agent metadata from HuggingFace
    agents = load_agents_from_hf()
    if not agents:
        print("No agents found in HuggingFace dataset")
        return

    print(f"\n{'='*80}")
    print(f"Starting PR metadata mining for {len(agents)} agents")
    print(f"Time frame: Last {LEADERBOARD_TIME_FRAME_DAYS} days")
    print(f"{'='*80}\n")

    # Mine each agent
    for agent in agents:
        identifier = agent.get('github_identifier')
        agent_name = agent.get('agent_name', 'Unknown')

        if not identifier:
            print(f"Warning: Skipping agent without identifier: {agent}")
            continue

        try:
            print(f"\n{'='*80}")
            print(f"Processing: {agent_name} ({identifier})")
            print(f"{'='*80}")

            # Fetch PR metadata
            metadata = fetch_all_prs_metadata(identifier, agent_name, token)

            if metadata:
                print(f"💾 Saving {len(metadata)} PR records...")
                save_pr_metadata_to_hf(metadata, identifier)
                print(f"✓ Successfully processed {agent_name}")
            else:
                print(f" No PRs found for {agent_name}")

        except Exception as e:
            print(f"✗ Error processing {identifier}: {str(e)}")
            import traceback
            traceback.print_exc()
            continue

    print(f"\n{'='*80}")
    print(f"✅ Mining complete for all agents")
    print(f"{'='*80}\n")


# =============================================================================
# ENTRY POINT
# =============================================================================

if __name__ == "__main__":
    mine_all_agents()