zhiminy committed
Commit 48dc738 · 1 Parent(s): d7fcb0a
Files changed (2):
  1. app.py +74 -11
  2. msr.py +74 -12
app.py CHANGED
@@ -105,6 +105,61 @@ def parse_date_string(date_string):
 # BIGQUERY FUNCTIONS
 # =============================================================================
 
+def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=100):
+    """
+    Fetch PR metadata for ALL agents using BATCHED BigQuery queries.
+    Splits the agent list into smaller batches to avoid performance issues with large numbers of agents.
+
+    Args:
+        client: BigQuery client instance
+        identifiers: List of GitHub usernames/bot identifiers
+        start_date: Start datetime (timezone-aware)
+        end_date: End datetime (timezone-aware)
+        batch_size: Number of agents to process per batch (default: 100)
+
+    Returns:
+        Dictionary mapping agent identifier to list of PR metadata
+    """
+    # Split identifiers into batches
+    batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
+    total_batches = len(batches)
+
+    print(f"\n🔍 Using BATCHED approach for {len(identifiers)} agents")
+    print(f"   Total batches: {total_batches} (batch size: {batch_size})")
+    print(f"   Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
+
+    # Collect results from all batches
+    all_metadata = {}
+
+    for batch_num, batch_identifiers in enumerate(batches, 1):
+        print(f"\n📦 Processing batch {batch_num}/{total_batches} ({len(batch_identifiers)} agents)...")
+
+        try:
+            # Query each batch
+            batch_results = fetch_all_pr_metadata_single_query(
+                client, batch_identifiers, start_date, end_date
+            )
+
+            # Merge results
+            for identifier, metadata_list in batch_results.items():
+                if identifier in all_metadata:
+                    all_metadata[identifier].extend(metadata_list)
+                else:
+                    all_metadata[identifier] = metadata_list
+
+            print(f"   ✓ Batch {batch_num}/{total_batches} complete")
+
+        except Exception as e:
+            print(f"   ✗ Batch {batch_num}/{total_batches} failed: {str(e)}")
+            print(f"   Continuing with remaining batches...")
+            continue
+
+    total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
+    print(f"\n✓ All batches complete! Found {total_prs} total PRs across {len(all_metadata)} agents")
+
+    return all_metadata
+
+
 def get_bigquery_client():
     """
     Initialize BigQuery client using credentials from environment variable.
@@ -161,7 +216,10 @@ def generate_table_union_statements(start_date, end_date):
 
 def fetch_all_pr_metadata_single_query(client, identifiers, start_date, end_date):
     """
-    Fetch PR metadata for ALL agents using ONE comprehensive BigQuery query.
+    Fetch PR metadata for a BATCH of agents using ONE comprehensive BigQuery query.
+
+    NOTE: This function is designed for smaller batches (~100 agents).
+    For large numbers of agents, use fetch_issue_metadata_batched() instead.
 
     This query fetches PRs authored by agents (user.login matches identifier).
 
@@ -174,7 +232,7 @@ def fetch_all_pr_metadata_single_query(client, identifiers, start_date, end_date
     Returns:
         Dictionary mapping agent identifier to list of PR metadata
     """
-    print(f"\n🔍 Querying BigQuery for ALL {len(identifiers)} agents in ONE QUERY")
+    print(f"   Querying BigQuery for {len(identifiers)} agents in this batch...")
     print(f"   Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
 
     # Generate table UNION statements for the time range
@@ -228,14 +286,14 @@ def fetch_all_pr_metadata_single_query(client, identifiers, start_date, end_date
     ORDER BY created_at DESC
     """
 
-    print(f"   Querying {(end_date - start_date).days} days of GitHub Archive data...")
-    print(f"   Agents: {', '.join(identifiers[:5])}{'...' if len(identifiers) > 5 else ''}")
+    print(f"   Scanning {(end_date - start_date).days} days of GitHub Archive data...")
+    print(f"   Batch agents: {', '.join(identifiers[:5])}{'...' if len(identifiers) > 5 else ''}")
 
     try:
         query_job = client.query(query)
         results = list(query_job.result())
 
-        print(f"   ✓ Found {len(results)} total PRs across all agents")
+        print(f"   ✓ Found {len(results)} PRs in this batch")
 
         # Group results by agent
         metadata_by_agent = defaultdict(list)
@@ -266,8 +324,8 @@ def fetch_all_pr_metadata_single_query(client, identifiers, start_date, end_date
             if pr_author and pr_author in identifiers:
                 metadata_by_agent[pr_author].append(pr_data)
 
-        # Print breakdown by agent
-        print(f"\n   📊 Results breakdown by agent:")
+        # Print breakdown by agent (only show agents with PRs)
+        print(f"   📊 Batch breakdown:")
         for identifier in identifiers:
             count = len(metadata_by_agent.get(identifier, []))
             if count > 0:
@@ -989,7 +1047,7 @@ def mine_all_agents():
     print(f"\n{'='*80}")
     print(f"Starting PR metadata mining for {len(identifiers)} agents")
     print(f"Time frame: Last {UPDATE_TIME_FRAME_DAYS} days")
-    print(f"Data source: BigQuery + GitHub Archive (ONE QUERY FOR ALL AGENTS)")
+    print(f"Data source: BigQuery + GitHub Archive (BATCHED QUERIES)")
     print(f"{'='*80}\n")
 
     # Initialize BigQuery client
@@ -1005,8 +1063,9 @@ def mine_all_agents():
     start_date = end_date - timedelta(days=UPDATE_TIME_FRAME_DAYS)
 
     try:
-        all_metadata = fetch_all_pr_metadata_single_query(
-            client, identifiers, start_date, end_date
+        # Use batched approach for better performance
+        all_metadata = fetch_issue_metadata_batched(
+            client, identifiers, start_date, end_date, batch_size=100
         )
     except Exception as e:
         print(f"✗ Error during BigQuery fetch: {str(e)}")
@@ -1054,13 +1113,17 @@ def mine_all_agents():
             error_count += 1
             continue
 
+    # Calculate number of batches
+    batch_size = 100
+    total_batches = (len(identifiers) + batch_size - 1) // batch_size
+
     print(f"\n{'='*80}")
     print(f"✅ Mining complete!")
     print(f"   Total agents: {len(agents)}")
     print(f"   Successfully saved: {success_count}")
     print(f"   No data (skipped): {no_data_count}")
     print(f"   Errors: {error_count}")
-    print(f"   BigQuery queries executed: 1")
+    print(f"   BigQuery batches executed: {total_batches} (batch size: {batch_size})")
     print(f"{'='*80}\n")
 
     # After mining is complete, save leaderboard and metrics to HuggingFace
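
Both files apply the same chunk-and-merge pattern: slice the identifier list into fixed-size batches, run the existing single-query fetcher on each batch, and fold the per-batch dict-of-lists results into one dictionary. Below is a minimal standalone sketch of that pattern; fake_fetch is an illustrative stand-in for fetch_all_pr_metadata_single_query (not part of the repo), and a defaultdict replaces the explicit if/else merge branch:

from collections import defaultdict

def chunked(items, size):
    # Yield consecutive slices of at most `size` items.
    for i in range(0, len(items), size):
        yield items[i:i + size]

def fetch_batched(identifiers, fetch_one_batch, batch_size=100):
    # Run the per-batch fetcher and merge its dict-of-lists results.
    merged = defaultdict(list)
    for batch in chunked(identifiers, batch_size):
        for key, rows in fetch_one_batch(batch).items():
            merged[key].extend(rows)
    return dict(merged)

# Illustrative stand-in for fetch_all_pr_metadata_single_query.
def fake_fetch(batch):
    return {name: [f"pr-by-{name}"] for name in batch}

agents = [f"agent-{i}" for i in range(250)]
result = fetch_batched(agents, fake_fetch)
print(len(result))  # 250 agents, fetched in 3 batches of <= 100

The defaultdict variant behaves identically to the committed if/else merge; it just removes a branch.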
msr.py CHANGED
@@ -118,13 +118,70 @@ def get_hf_token():
 # BIGQUERY FUNCTIONS
 # =============================================================================
 
+def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=100):
+    """
+    Fetch PR metadata for ALL agents using BATCHED BigQuery queries.
+    Splits the agent list into smaller batches to avoid performance issues with large numbers of agents.
+
+    Args:
+        client: BigQuery client instance
+        identifiers: List of GitHub usernames/bot identifiers
+        start_date: Start datetime (timezone-aware)
+        end_date: End datetime (timezone-aware)
+        batch_size: Number of agents to process per batch (default: 100)
+
+    Returns:
+        Dictionary mapping agent identifier to list of PR metadata
+    """
+    # Split identifiers into batches
+    batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
+    total_batches = len(batches)
+
+    print(f"\n🔍 Using BATCHED approach for {len(identifiers)} agents")
+    print(f"   Total batches: {total_batches} (batch size: {batch_size})")
+    print(f"   Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
+
+    # Collect results from all batches
+    all_metadata = {}
+
+    for batch_num, batch_identifiers in enumerate(batches, 1):
+        print(f"\n📦 Processing batch {batch_num}/{total_batches} ({len(batch_identifiers)} agents)...")
+
+        try:
+            # Query each batch
+            batch_results = fetch_all_pr_metadata_single_query(
+                client, batch_identifiers, start_date, end_date
+            )
+
+            # Merge results
+            for identifier, metadata_list in batch_results.items():
+                if identifier in all_metadata:
+                    all_metadata[identifier].extend(metadata_list)
+                else:
+                    all_metadata[identifier] = metadata_list
+
+            print(f"   ✓ Batch {batch_num}/{total_batches} complete")
+
+        except Exception as e:
+            print(f"   ✗ Batch {batch_num}/{total_batches} failed: {str(e)}")
+            print(f"   Continuing with remaining batches...")
+            continue
+
+    total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
+    print(f"\n✓ All batches complete! Found {total_prs} total PRs across {len(all_metadata)} agents")
+
+    return all_metadata
+
+
 def fetch_all_pr_metadata_single_query(client, identifiers, start_date, end_date):
     """
-    Fetch PR metadata for ALL agents using ONE comprehensive BigQuery query.
+    Fetch PR metadata for a BATCH of agents using ONE comprehensive BigQuery query.
+
+    NOTE: This function is designed for smaller batches (~100 agents).
+    For large numbers of agents, use fetch_issue_metadata_batched() instead.
 
     This query fetches:
     1. PRs authored by agents (user.login matches identifier)
-    2. PRs from branches starting with agent identifier (head.ref pattern)
 
     Args:
         client: BigQuery client instance
@@ -147,7 +204,7 @@ def fetch_all_pr_metadata_single_query(client, identifiers, start_date, end_date
             ...
         }
     """
-    print(f"\n🔍 Querying BigQuery for ALL {len(identifiers)} agents in ONE QUERY")
+    print(f"   Querying BigQuery for {len(identifiers)} agents in this batch...")
    print(f"   Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
 
     # Generate table UNION statements for the time range
@@ -201,14 +258,14 @@ def fetch_all_pr_metadata_single_query(client, identifiers, start_date, end_date
     ORDER BY created_at DESC
     """
 
-    print(f"   Querying {(end_date - start_date).days} days of GitHub Archive data...")
-    print(f"   Agents: {', '.join(identifiers[:5])}{'...' if len(identifiers) > 5 else ''}")
+    print(f"   Scanning {(end_date - start_date).days} days of GitHub Archive data...")
+    print(f"   Batch agents: {', '.join(identifiers[:5])}{'...' if len(identifiers) > 5 else ''}")
 
     try:
         query_job = client.query(query)
         results = list(query_job.result())
 
-        print(f"   ✓ Found {len(results)} total PRs across all agents")
+        print(f"   ✓ Found {len(results)} PRs in this batch")
 
         # Group results by agent
         metadata_by_agent = defaultdict(list)
@@ -239,8 +296,8 @@ def fetch_all_pr_metadata_single_query(client, identifiers, start_date, end_date
             if pr_author and pr_author in identifiers:
                 metadata_by_agent[pr_author].append(pr_data)
 
-        # Print breakdown by agent
-        print(f"\n   📊 Results breakdown by agent:")
+        # Print breakdown by agent (only show agents with PRs)
+        print(f"   📊 Batch breakdown:")
         for identifier in identifiers:
             count = len(metadata_by_agent.get(identifier, []))
             if count > 0:
@@ -726,7 +783,7 @@ def mine_all_agents():
     print(f"\n{'='*80}")
     print(f"Starting PR metadata mining for {len(identifiers)} agents")
     print(f"Time frame: Last {LEADERBOARD_TIME_FRAME_DAYS} days")
-    print(f"Data source: BigQuery + GitHub Archive (ONE QUERY FOR ALL AGENTS)")
+    print(f"Data source: BigQuery + GitHub Archive (BATCHED QUERIES)")
     print(f"{'='*80}\n")
 
     # Initialize BigQuery client
@@ -742,8 +799,9 @@ def mine_all_agents():
     start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
 
     try:
-        all_metadata = fetch_all_pr_metadata_single_query(
-            client, identifiers, start_date, end_date
+        # Use batched approach for better performance
+        all_metadata = fetch_issue_metadata_batched(
+            client, identifiers, start_date, end_date, batch_size=100
         )
     except Exception as e:
         print(f"✗ Error during BigQuery fetch: {str(e)}")
@@ -791,13 +849,17 @@ def mine_all_agents():
             error_count += 1
             continue
 
+    # Calculate number of batches
+    batch_size = 100
+    total_batches = (len(identifiers) + batch_size - 1) // batch_size
+
     print(f"\n{'='*80}")
     print(f"✅ Mining complete!")
     print(f"   Total agents: {len(agents)}")
     print(f"   Successfully saved: {success_count}")
     print(f"   No data (skipped): {no_data_count}")
     print(f"   Errors: {error_count}")
-    print(f"   BigQuery queries executed: 1")
+    print(f"   BigQuery batches executed: {total_batches} (batch size: {batch_size})")
     print(f"{'='*80}\n")
 
     # Compute and save leaderboard data
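
One small duplication worth noting: mine_all_agents recomputes the batch count for its summary line with integer ceiling division instead of receiving it back from fetch_issue_metadata_batched. The (n + b - 1) // b idiom equals math.ceil(n / b) for positive b; a quick check with illustrative counts:

import math

batch_size = 100
for n in (0, 1, 99, 100, 101, 250):
    by_idiom = (n + batch_size - 1) // batch_size
    assert by_idiom == math.ceil(n / batch_size)
    print(f"{n} identifiers -> {by_idiom} batch(es)")

Returning total_batches from the fetch function, or sharing a single batch_size constant, would keep the summary from drifting if the default of 100 ever changes.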