zhiminy committed
Commit 5d479fd · 1 Parent(s): b784bb3

refine workflow

Files changed (2):
  1. app.py +35 -55
  2. msr.py +36 -56
app.py CHANGED
@@ -177,7 +177,7 @@ def upload_file_with_backoff(api, **kwargs):
 # BIGQUERY FUNCTIONS
 # =============================================================================
 
-def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50):
+def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True):
     """
     Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
     Splits agents into smaller batches to avoid performance issues with large numbers of agents.
@@ -188,6 +188,7 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50):
         start_date: Start datetime (timezone-aware)
         end_date: End datetime (timezone-aware)
         batch_size: Number of agents to process per batch (default: 50)
+        upload_immediately: If True, upload each batch's results to HuggingFace immediately (default: True)
 
     Returns:
         Dictionary mapping agent identifier to list of issue metadata
@@ -199,6 +200,10 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50):
     print(f"\n🔍 Using BATCHED approach for {len(identifiers)} agents")
     print(f" Total batches: {total_batches} (batch size: {batch_size})")
     print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
+    if upload_immediately:
+        print(f" Upload mode: Immediate (after each batch)")
+    else:
+        print(f" Upload mode: Deferred (all at once)")
 
     # Collect results from all batches
     all_metadata = {}
@@ -221,6 +226,21 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50):
 
             print(f" ✓ Batch {batch_num}/{total_batches} complete")
 
+            # Upload immediately after this batch if enabled
+            if upload_immediately and batch_results:
+                print(f"\n 📤 Uploading batch {batch_num}/{total_batches} results to HuggingFace...")
+                upload_success = 0
+                upload_errors = 0
+
+                for identifier, metadata_list in batch_results.items():
+                    if metadata_list:
+                        if save_pr_metadata_to_hf(metadata_list, identifier):
+                            upload_success += 1
+                        else:
+                            upload_errors += 1
+
+                print(f" ✓ Batch {batch_num}/{total_batches} upload complete ({upload_success} agents uploaded, {upload_errors} errors)")
+
         except Exception as e:
             print(f" ✗ Batch {batch_num}/{total_batches} failed: {str(e)}")
             print(f" Continuing with remaining batches...")
@@ -1142,68 +1162,28 @@ def mine_all_agents():
 
     try:
         # Use batched approach for better performance
+        # upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
         all_metadata = fetch_issue_metadata_batched(
-            client, identifiers, start_date, end_date, batch_size=50
+            client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True
         )
+
+        # Calculate summary statistics
+        total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
+        agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
+
+        print(f"\n{'='*80}")
+        print(f"✅ BigQuery mining and upload complete!")
+        print(f" Total agents: {len(agents)}")
+        print(f" Agents with data: {agents_with_data}")
+        print(f" Total PRs found: {total_prs}")
+        print(f"{'='*80}\n")
+
     except Exception as e:
         print(f"✗ Error during BigQuery fetch: {str(e)}")
         import traceback
         traceback.print_exc()
         return
 
-    # Save results for each agent
-    print(f"\n{'='*80}")
-    print(f"💾 Saving results to HuggingFace for each agent...")
-    print(f"{'='*80}\n")
-
-    success_count = 0
-    error_count = 0
-    no_data_count = 0
-
-    for i, agent in enumerate(agents, 1):
-        identifier = agent.get('github_identifier')
-        agent_name = agent.get('name', 'Unknown')
-
-        if not identifier:
-            print(f"[{i}/{len(agents)}] Skipping agent without identifier")
-            error_count += 1
-            continue
-
-        metadata = all_metadata.get(identifier, [])
-
-        print(f"[{i}/{len(agents)}] {agent_name} ({identifier}):")
-
-        try:
-            if metadata:
-                print(f" 💾 Saving {len(metadata)} PR records...")
-                if save_pr_metadata_to_hf(metadata, identifier):
-                    success_count += 1
-                else:
-                    error_count += 1
-            else:
-                print(f" No PRs found")
-                no_data_count += 1
-
-        except Exception as e:
-            print(f" ✗ Error saving {identifier}: {str(e)}")
-            import traceback
-            traceback.print_exc()
-            error_count += 1
-            continue
-
-    # Calculate number of batches
-    batch_size = 50
-    total_batches = (len(identifiers) + batch_size - 1) // batch_size
-
-    print(f"\n{'='*80}")
-    print(f"✅ Mining complete!")
-    print(f" Total agents: {len(agents)}")
-    print(f" Successfully saved: {success_count}")
-    print(f" No data (skipped): {no_data_count}")
-    print(f" Errors: {error_count}")
-    print(f" BigQuery batches executed: {total_batches} (batch size: {batch_size})")
-    print(f"{'='*80}\n")
-
     # After mining is complete, save leaderboard and metrics to HuggingFace
     print(f"📤 Uploading leaderboard and metrics data...")
     if save_leaderboard_and_metrics_to_hf():
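
The same refinement lands in msr.py below. One note on the new flag first: callers can still opt out of per-batch uploads. A minimal sketch of the deferred mode, assuming the client, identifiers, date range, and save_pr_metadata_to_hf helper that app.py already defines:

# Sketch only: deferred mode fetches everything first, then uploads once.
all_metadata = fetch_issue_metadata_batched(
    client, identifiers, start_date, end_date,
    batch_size=50, upload_immediately=False,
)

# A single upload pass at the end, mirroring the per-batch loop that
# upload_immediately=True now runs inside fetch_issue_metadata_batched.
for identifier, metadata_list in all_metadata.items():
    if metadata_list:
        save_pr_metadata_to_hf(metadata_list, identifier)

The immediate mode appears to trade a few extra HuggingFace commits for durability: since batches continue past failures, results from earlier batches are already persisted if a later batch fails.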
msr.py CHANGED
@@ -190,7 +190,7 @@ def upload_file_with_backoff(api, **kwargs):
 # BIGQUERY FUNCTIONS
 # =============================================================================
 
-def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50):
+def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True):
     """
     Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
     Splits agents into smaller batches to avoid performance issues with large numbers of agents.
@@ -200,7 +200,8 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50):
         identifiers: List of GitHub usernames/bot identifiers
         start_date: Start datetime (timezone-aware)
         end_date: End datetime (timezone-aware)
-        batch_size: Number of agents to process per batch (default: 100)
+        batch_size: Number of agents to process per batch (default: 50)
+        upload_immediately: If True, upload each batch's results to HuggingFace immediately (default: True)
 
     Returns:
         Dictionary mapping agent identifier to list of issue metadata
@@ -212,6 +213,10 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50):
     print(f"\n🔍 Using BATCHED approach for {len(identifiers)} agents")
     print(f" Total batches: {total_batches} (batch size: {batch_size})")
     print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
+    if upload_immediately:
+        print(f" Upload mode: Immediate (after each batch)")
+    else:
+        print(f" Upload mode: Deferred (all at once)")
 
     # Collect results from all batches
     all_metadata = {}
@@ -234,6 +239,21 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50):
 
             print(f" ✓ Batch {batch_num}/{total_batches} complete")
 
+            # Upload immediately after this batch if enabled
+            if upload_immediately and batch_results:
+                print(f"\n 📤 Uploading batch {batch_num}/{total_batches} results to HuggingFace...")
+                upload_success = 0
+                upload_errors = 0
+
+                for identifier, metadata_list in batch_results.items():
+                    if metadata_list:
+                        if save_pr_metadata_to_hf(metadata_list, identifier):
+                            upload_success += 1
+                        else:
+                            upload_errors += 1
+
+                print(f" ✓ Batch {batch_num}/{total_batches} upload complete ({upload_success} agents uploaded, {upload_errors} errors)")
+
         except Exception as e:
             print(f" ✗ Batch {batch_num}/{total_batches} failed: {str(e)}")
             print(f" Continuing with remaining batches...")
@@ -875,68 +895,28 @@ def mine_all_agents():
 
     try:
         # Use batched approach for better performance
+        # upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
        all_metadata = fetch_issue_metadata_batched(
-            client, identifiers, start_date, end_date, batch_size=50
+            client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True
         )
+
+        # Calculate summary statistics
+        total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
+        agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
+
+        print(f"\n{'='*80}")
+        print(f"✅ BigQuery mining and upload complete!")
+        print(f" Total agents: {len(agents)}")
+        print(f" Agents with data: {agents_with_data}")
+        print(f" Total PRs found: {total_prs}")
+        print(f"{'='*80}\n")
+
     except Exception as e:
         print(f"✗ Error during BigQuery fetch: {str(e)}")
         import traceback
         traceback.print_exc()
         return
 
-    # Save results for each agent
-    print(f"\n{'='*80}")
-    print(f"💾 Saving results to HuggingFace for each agent...")
-    print(f"{'='*80}\n")
-
-    success_count = 0
-    error_count = 0
-    no_data_count = 0
-
-    for i, agent in enumerate(agents, 1):
-        identifier = agent.get('github_identifier')
-        agent_name = agent.get('name', 'Unknown')
-
-        if not identifier:
-            print(f"[{i}/{len(agents)}] Skipping agent without identifier")
-            error_count += 1
-            continue
-
-        metadata = all_metadata.get(identifier, [])
-
-        print(f"[{i}/{len(agents)}] {agent_name} ({identifier}):")
-
-        try:
-            if metadata:
-                print(f" 💾 Saving {len(metadata)} PR records...")
-                if save_pr_metadata_to_hf(metadata, identifier):
-                    success_count += 1
-                else:
-                    error_count += 1
-            else:
-                print(f" No PRs found")
-                no_data_count += 1
-
-        except Exception as e:
-            print(f" ✗ Error saving {identifier}: {str(e)}")
-            import traceback
-            traceback.print_exc()
-            error_count += 1
-            continue
-
-    # Calculate number of batches
-    batch_size = 50
-    total_batches = (len(identifiers) + batch_size - 1) // batch_size
-
-    print(f"\n{'='*80}")
-    print(f"✅ Mining complete!")
-    print(f" Total agents: {len(agents)}")
-    print(f" Successfully saved: {success_count}")
-    print(f" No data (skipped): {no_data_count}")
-    print(f" Errors: {error_count}")
-    print(f" BigQuery batches executed: {total_batches} (batch size: {batch_size})")
-    print(f"{'='*80}\n")
-
     # Compute and save leaderboard data
     print(f"\n{'='*80}")
     print(f"📊 Computing leaderboard and monthly metrics...")