refine workflow
app.py
CHANGED
@@ -177,7 +177,7 @@ def upload_file_with_backoff(api, **kwargs):
 # BIGQUERY FUNCTIONS
 # =============================================================================
 
-def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50):
+def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True):
     """
     Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
     Splits agents into smaller batches to avoid performance issues with large numbers of agents.
@@ -188,6 +188,7 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batc
         start_date: Start datetime (timezone-aware)
         end_date: End datetime (timezone-aware)
         batch_size: Number of agents to process per batch (default: 50)
+        upload_immediately: If True, upload each batch's results to HuggingFace immediately (default: True)
 
     Returns:
         Dictionary mapping agent identifier to list of issue metadata
@@ -199,6 +200,10 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batc
     print(f"\n🔍 Using BATCHED approach for {len(identifiers)} agents")
     print(f" Total batches: {total_batches} (batch size: {batch_size})")
     print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
+    if upload_immediately:
+        print(f" Upload mode: Immediate (after each batch)")
+    else:
+        print(f" Upload mode: Deferred (all at once)")
 
     # Collect results from all batches
     all_metadata = {}
@@ -221,6 +226,21 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batc
 
             print(f" ✓ Batch {batch_num}/{total_batches} complete")
 
+            # Upload immediately after this batch if enabled
+            if upload_immediately and batch_results:
+                print(f"\n 📤 Uploading batch {batch_num}/{total_batches} results to HuggingFace...")
+                upload_success = 0
+                upload_errors = 0
+
+                for identifier, metadata_list in batch_results.items():
+                    if metadata_list:
+                        if save_pr_metadata_to_hf(metadata_list, identifier):
+                            upload_success += 1
+                        else:
+                            upload_errors += 1
+
+                print(f" ✓ Batch {batch_num}/{total_batches} upload complete ({upload_success} agents uploaded, {upload_errors} errors)")
+
         except Exception as e:
             print(f" ✗ Batch {batch_num}/{total_batches} failed: {str(e)}")
             print(f" Continuing with remaining batches...")
@@ -1142,68 +1162,28 @@ def mine_all_agents():
 
     try:
         # Use batched approach for better performance
+        # upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
        all_metadata = fetch_issue_metadata_batched(
-            client, identifiers, start_date, end_date, batch_size=50
+            client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True
        )
+
+        # Calculate summary statistics
+        total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
+        agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
+
+        print(f"\n{'='*80}")
+        print(f"✅ BigQuery mining and upload complete!")
+        print(f" Total agents: {len(agents)}")
+        print(f" Agents with data: {agents_with_data}")
+        print(f" Total PRs found: {total_prs}")
+        print(f"{'='*80}\n")
+
     except Exception as e:
         print(f"✗ Error during BigQuery fetch: {str(e)}")
         import traceback
         traceback.print_exc()
         return
 
-    # Save results for each agent
-    print(f"\n{'='*80}")
-    print(f"💾 Saving results to HuggingFace for each agent...")
-    print(f"{'='*80}\n")
-
-    success_count = 0
-    error_count = 0
-    no_data_count = 0
-
-    for i, agent in enumerate(agents, 1):
-        identifier = agent.get('github_identifier')
-        agent_name = agent.get('name', 'Unknown')
-
-        if not identifier:
-            print(f"[{i}/{len(agents)}] Skipping agent without identifier")
-            error_count += 1
-            continue
-
-        metadata = all_metadata.get(identifier, [])
-
-        print(f"[{i}/{len(agents)}] {agent_name} ({identifier}):")
-
-        try:
-            if metadata:
-                print(f" 💾 Saving {len(metadata)} PR records...")
-                if save_pr_metadata_to_hf(metadata, identifier):
-                    success_count += 1
-                else:
-                    error_count += 1
-            else:
-                print(f" No PRs found")
-                no_data_count += 1
-
-        except Exception as e:
-            print(f" ✗ Error saving {identifier}: {str(e)}")
-            import traceback
-            traceback.print_exc()
-            error_count += 1
-            continue
-
-    # Calculate number of batches
-    batch_size = 50
-    total_batches = (len(identifiers) + batch_size - 1) // batch_size
-
-    print(f"\n{'='*80}")
-    print(f"✅ Mining complete!")
-    print(f" Total agents: {len(agents)}")
-    print(f" Successfully saved: {success_count}")
-    print(f" No data (skipped): {no_data_count}")
-    print(f" Errors: {error_count}")
-    print(f" BigQuery batches executed: {total_batches} (batch size: {batch_size})")
-    print(f"{'='*80}\n")
-
     # After mining is complete, save leaderboard and metrics to HuggingFace
     print(f"📤 Uploading leaderboard and metrics data...")
     if save_leaderboard_and_metrics_to_hf():
msr.py
CHANGED
@@ -190,7 +190,7 @@ def upload_file_with_backoff(api, **kwargs):
 # BIGQUERY FUNCTIONS
 # =============================================================================
 
-def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50):
+def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True):
     """
     Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
     Splits agents into smaller batches to avoid performance issues with large numbers of agents.
@@ -200,7 +200,8 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batc
         identifiers: List of GitHub usernames/bot identifiers
         start_date: Start datetime (timezone-aware)
        end_date: End datetime (timezone-aware)
-        batch_size: Number of agents to process per batch (default:
+        batch_size: Number of agents to process per batch (default: 50)
+        upload_immediately: If True, upload each batch's results to HuggingFace immediately (default: True)
 
     Returns:
         Dictionary mapping agent identifier to list of issue metadata
@@ -212,6 +213,10 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batc
     print(f"\n🔍 Using BATCHED approach for {len(identifiers)} agents")
     print(f" Total batches: {total_batches} (batch size: {batch_size})")
     print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
+    if upload_immediately:
+        print(f" Upload mode: Immediate (after each batch)")
+    else:
+        print(f" Upload mode: Deferred (all at once)")
 
     # Collect results from all batches
     all_metadata = {}
@@ -234,6 +239,21 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batc
 
             print(f" ✓ Batch {batch_num}/{total_batches} complete")
 
+            # Upload immediately after this batch if enabled
+            if upload_immediately and batch_results:
+                print(f"\n 📤 Uploading batch {batch_num}/{total_batches} results to HuggingFace...")
+                upload_success = 0
+                upload_errors = 0
+
+                for identifier, metadata_list in batch_results.items():
+                    if metadata_list:
+                        if save_pr_metadata_to_hf(metadata_list, identifier):
+                            upload_success += 1
+                        else:
+                            upload_errors += 1
+
+                print(f" ✓ Batch {batch_num}/{total_batches} upload complete ({upload_success} agents uploaded, {upload_errors} errors)")
+
         except Exception as e:
             print(f" ✗ Batch {batch_num}/{total_batches} failed: {str(e)}")
             print(f" Continuing with remaining batches...")
@@ -875,68 +895,28 @@ def mine_all_agents():
 
     try:
         # Use batched approach for better performance
+        # upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
        all_metadata = fetch_issue_metadata_batched(
-            client, identifiers, start_date, end_date, batch_size=50
+            client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True
        )
+
+        # Calculate summary statistics
+        total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
+        agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
+
+        print(f"\n{'='*80}")
+        print(f"✅ BigQuery mining and upload complete!")
+        print(f" Total agents: {len(agents)}")
+        print(f" Agents with data: {agents_with_data}")
+        print(f" Total PRs found: {total_prs}")
+        print(f"{'='*80}\n")
+
     except Exception as e:
         print(f"✗ Error during BigQuery fetch: {str(e)}")
         import traceback
         traceback.print_exc()
         return
 
-    # Save results for each agent
-    print(f"\n{'='*80}")
-    print(f"💾 Saving results to HuggingFace for each agent...")
-    print(f"{'='*80}\n")
-
-    success_count = 0
-    error_count = 0
-    no_data_count = 0
-
-    for i, agent in enumerate(agents, 1):
-        identifier = agent.get('github_identifier')
-        agent_name = agent.get('name', 'Unknown')
-
-        if not identifier:
-            print(f"[{i}/{len(agents)}] Skipping agent without identifier")
-            error_count += 1
-            continue
-
-        metadata = all_metadata.get(identifier, [])
-
-        print(f"[{i}/{len(agents)}] {agent_name} ({identifier}):")
-
-        try:
-            if metadata:
-                print(f" 💾 Saving {len(metadata)} PR records...")
-                if save_pr_metadata_to_hf(metadata, identifier):
-                    success_count += 1
-                else:
-                    error_count += 1
-            else:
-                print(f" No PRs found")
-                no_data_count += 1
-
-        except Exception as e:
-            print(f" ✗ Error saving {identifier}: {str(e)}")
-            import traceback
-            traceback.print_exc()
-            error_count += 1
-            continue
-
-    # Calculate number of batches
-    batch_size = 50
-    total_batches = (len(identifiers) + batch_size - 1) // batch_size
-
-    print(f"\n{'='*80}")
-    print(f"✅ Mining complete!")
-    print(f" Total agents: {len(agents)}")
-    print(f" Successfully saved: {success_count}")
-    print(f" No data (skipped): {no_data_count}")
-    print(f" Errors: {error_count}")
-    print(f" BigQuery batches executed: {total_batches} (batch size: {batch_size})")
-    print(f"{'='*80}\n")
-
     # Compute and save leaderboard data
     print(f"\n{'='*80}")
     print(f"📊 Computing leaderboard and monthly metrics...")