zhiminy committed
Commit 5d479fd · 1 Parent(s): b784bb3

refine workflow

Files changed (2):
  1. app.py +35 -55
  2. msr.py +36 -56
app.py CHANGED
@@ -177,7 +177,7 @@ def upload_file_with_backoff(api, **kwargs):
 # BIGQUERY FUNCTIONS
 # =============================================================================
 
-def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50):
+def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True):
     """
     Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
     Splits agents into smaller batches to avoid performance issues with large numbers of agents.
@@ -188,6 +188,7 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50):
         start_date: Start datetime (timezone-aware)
         end_date: End datetime (timezone-aware)
         batch_size: Number of agents to process per batch (default: 50)
+        upload_immediately: If True, upload each batch's results to HuggingFace immediately (default: True)
 
     Returns:
         Dictionary mapping agent identifier to list of issue metadata
@@ -199,6 +200,10 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50):
     print(f"\n🔍 Using BATCHED approach for {len(identifiers)} agents")
     print(f" Total batches: {total_batches} (batch size: {batch_size})")
     print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
+    if upload_immediately:
+        print(f" Upload mode: Immediate (after each batch)")
+    else:
+        print(f" Upload mode: Deferred (all at once)")
 
     # Collect results from all batches
     all_metadata = {}
@@ -221,6 +226,21 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50):
 
             print(f" ✓ Batch {batch_num}/{total_batches} complete")
 
+            # Upload immediately after this batch if enabled
+            if upload_immediately and batch_results:
+                print(f"\n 📤 Uploading batch {batch_num}/{total_batches} results to HuggingFace...")
+                upload_success = 0
+                upload_errors = 0
+
+                for identifier, metadata_list in batch_results.items():
+                    if metadata_list:
+                        if save_pr_metadata_to_hf(metadata_list, identifier):
+                            upload_success += 1
+                        else:
+                            upload_errors += 1
+
+                print(f" ✓ Batch {batch_num}/{total_batches} upload complete ({upload_success} agents uploaded, {upload_errors} errors)")
+
         except Exception as e:
             print(f" ✗ Batch {batch_num}/{total_batches} failed: {str(e)}")
             print(f" Continuing with remaining batches...")
@@ -1142,68 +1162,28 @@ def mine_all_agents():
 
     try:
         # Use batched approach for better performance
+        # upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
         all_metadata = fetch_issue_metadata_batched(
-            client, identifiers, start_date, end_date, batch_size=50
+            client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True
         )
+
+        # Calculate summary statistics
+        total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
+        agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
+
+        print(f"\n{'='*80}")
+        print(f"✅ BigQuery mining and upload complete!")
+        print(f" Total agents: {len(agents)}")
+        print(f" Agents with data: {agents_with_data}")
+        print(f" Total PRs found: {total_prs}")
+        print(f"{'='*80}\n")
+
     except Exception as e:
         print(f"✗ Error during BigQuery fetch: {str(e)}")
         import traceback
         traceback.print_exc()
         return
 
-    # Save results for each agent
-    print(f"\n{'='*80}")
-    print(f"💾 Saving results to HuggingFace for each agent...")
-    print(f"{'='*80}\n")
-
-    success_count = 0
-    error_count = 0
-    no_data_count = 0
-
-    for i, agent in enumerate(agents, 1):
-        identifier = agent.get('github_identifier')
-        agent_name = agent.get('name', 'Unknown')
-
-        if not identifier:
-            print(f"[{i}/{len(agents)}] Skipping agent without identifier")
-            error_count += 1
-            continue
-
-        metadata = all_metadata.get(identifier, [])
-
-        print(f"[{i}/{len(agents)}] {agent_name} ({identifier}):")
-
-        try:
-            if metadata:
-                print(f" 💾 Saving {len(metadata)} PR records...")
-                if save_pr_metadata_to_hf(metadata, identifier):
-                    success_count += 1
-                else:
-                    error_count += 1
-            else:
-                print(f" No PRs found")
-                no_data_count += 1
-
-        except Exception as e:
-            print(f" ✗ Error saving {identifier}: {str(e)}")
-            import traceback
-            traceback.print_exc()
-            error_count += 1
-            continue
-
-    # Calculate number of batches
-    batch_size = 50
-    total_batches = (len(identifiers) + batch_size - 1) // batch_size
-
-    print(f"\n{'='*80}")
-    print(f"✅ Mining complete!")
-    print(f" Total agents: {len(agents)}")
-    print(f" Successfully saved: {success_count}")
-    print(f" No data (skipped): {no_data_count}")
-    print(f" Errors: {error_count}")
-    print(f" BigQuery batches executed: {total_batches} (batch size: {batch_size})")
-    print(f"{'='*80}\n")
-
     # After mining is complete, save leaderboard and metrics to HuggingFace
     print(f"📤 Uploading leaderboard and metrics data...")
     if save_leaderboard_and_metrics_to_hf():
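
The same refinement lands in msr.py below. One note on the new flag first: callers can still opt out of per-batch uploads. A minimal sketch of the deferred mode, assuming the client, identifiers, date range, and save_pr_metadata_to_hf helper that app.py already defines:

# Sketch only: deferred mode fetches everything first, then uploads once.
all_metadata = fetch_issue_metadata_batched(
    client, identifiers, start_date, end_date,
    batch_size=50, upload_immediately=False,
)

# A single upload pass at the end, mirroring the per-batch loop that
# upload_immediately=True now runs inside fetch_issue_metadata_batched.
for identifier, metadata_list in all_metadata.items():
    if metadata_list:
        save_pr_metadata_to_hf(metadata_list, identifier)

The immediate mode appears to trade a few extra HuggingFace commits for durability: since batches continue past failures, results from earlier batches are already persisted if a later batch fails.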
msr.py CHANGED
@@ -190,7 +190,7 @@ def upload_file_with_backoff(api, **kwargs):
 # BIGQUERY FUNCTIONS
 # =============================================================================
 
-def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50):
+def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True):
     """
     Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
     Splits agents into smaller batches to avoid performance issues with large numbers of agents.
@@ -200,7 +200,8 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50):
         identifiers: List of GitHub usernames/bot identifiers
         start_date: Start datetime (timezone-aware)
         end_date: End datetime (timezone-aware)
-        batch_size: Number of agents to process per batch (default: 100)
+        batch_size: Number of agents to process per batch (default: 50)
+        upload_immediately: If True, upload each batch's results to HuggingFace immediately (default: True)
 
     Returns:
         Dictionary mapping agent identifier to list of issue metadata
@@ -212,6 +213,10 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50):
     print(f"\n🔍 Using BATCHED approach for {len(identifiers)} agents")
     print(f" Total batches: {total_batches} (batch size: {batch_size})")
     print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
+    if upload_immediately:
+        print(f" Upload mode: Immediate (after each batch)")
+    else:
+        print(f" Upload mode: Deferred (all at once)")
 
     # Collect results from all batches
     all_metadata = {}
@@ -234,6 +239,21 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50):
 
             print(f" ✓ Batch {batch_num}/{total_batches} complete")
 
+            # Upload immediately after this batch if enabled
+            if upload_immediately and batch_results:
+                print(f"\n 📤 Uploading batch {batch_num}/{total_batches} results to HuggingFace...")
+                upload_success = 0
+                upload_errors = 0
+
+                for identifier, metadata_list in batch_results.items():
+                    if metadata_list:
+                        if save_pr_metadata_to_hf(metadata_list, identifier):
+                            upload_success += 1
+                        else:
+                            upload_errors += 1
+
+                print(f" ✓ Batch {batch_num}/{total_batches} upload complete ({upload_success} agents uploaded, {upload_errors} errors)")
+
         except Exception as e:
             print(f" ✗ Batch {batch_num}/{total_batches} failed: {str(e)}")
             print(f" Continuing with remaining batches...")
@@ -875,68 +895,28 @@ def mine_all_agents():
 
     try:
         # Use batched approach for better performance
+        # upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
        all_metadata = fetch_issue_metadata_batched(
-            client, identifiers, start_date, end_date, batch_size=50
+            client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True
         )
+
+        # Calculate summary statistics
+        total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
+        agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
+
+        print(f"\n{'='*80}")
+        print(f"✅ BigQuery mining and upload complete!")
+        print(f" Total agents: {len(agents)}")
+        print(f" Agents with data: {agents_with_data}")
+        print(f" Total PRs found: {total_prs}")
+        print(f"{'='*80}\n")
+
     except Exception as e:
         print(f"✗ Error during BigQuery fetch: {str(e)}")
         import traceback
         traceback.print_exc()
         return
 
-    # Save results for each agent
-    print(f"\n{'='*80}")
-    print(f"💾 Saving results to HuggingFace for each agent...")
-    print(f"{'='*80}\n")
-
-    success_count = 0
-    error_count = 0
-    no_data_count = 0
-
-    for i, agent in enumerate(agents, 1):
-        identifier = agent.get('github_identifier')
-        agent_name = agent.get('name', 'Unknown')
-
-        if not identifier:
-            print(f"[{i}/{len(agents)}] Skipping agent without identifier")
-            error_count += 1
-            continue
-
-        metadata = all_metadata.get(identifier, [])
-
-        print(f"[{i}/{len(agents)}] {agent_name} ({identifier}):")
-
-        try:
-            if metadata:
-                print(f" 💾 Saving {len(metadata)} PR records...")
-                if save_pr_metadata_to_hf(metadata, identifier):
-                    success_count += 1
-                else:
-                    error_count += 1
-            else:
-                print(f" No PRs found")
-                no_data_count += 1
-
-        except Exception as e:
-            print(f" ✗ Error saving {identifier}: {str(e)}")
-            import traceback
-            traceback.print_exc()
-            error_count += 1
-            continue
-
-    # Calculate number of batches
-    batch_size = 50
-    total_batches = (len(identifiers) + batch_size - 1) // batch_size
-
-    print(f"\n{'='*80}")
-    print(f"✅ Mining complete!")
-    print(f" Total agents: {len(agents)}")
-    print(f" Successfully saved: {success_count}")
-    print(f" No data (skipped): {no_data_count}")
-    print(f" Errors: {error_count}")
-    print(f" BigQuery batches executed: {total_batches} (batch size: {batch_size})")
-    print(f"{'='*80}\n")
-
     # Compute and save leaderboard data
     print(f"\n{'='*80}")
     print(f"📊 Computing leaderboard and monthly metrics...")