zhimin-z committed
Commit · 3f0eb80
Parent(s): b85e4dd

refine
msr.py
CHANGED
@@ -86,17 +86,23 @@ def save_jsonl(filename, data):
 
 def normalize_date_format(date_string):
     """
-    Convert date strings to standardized ISO 8601 format with Z suffix.
+    Convert date strings or datetime objects to standardized ISO 8601 format with Z suffix.
     Handles both 'T' and space-separated datetime formats (including newlines).
     Examples:
     - 2025-10-15T23:23:47.983068 -> 2025-10-15T23:23:47Z
     - 2025-06-17 21:21:07+00 -> 2025-06-17T21:21:07Z
+    - datetime object -> 2025-10-15T23:23:47Z
     """
     if not date_string or date_string == 'N/A':
         return 'N/A'
 
     try:
         import re
+
+        # Handle datetime objects directly
+        if isinstance(date_string, datetime):
+            return date_string.strftime('%Y-%m-%dT%H:%M:%SZ')
+
         # Remove all whitespace (spaces, newlines, tabs) and replace with single space
         date_string = re.sub(r'\s+', ' ', date_string.strip())
 
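The hunk above only touches the docstring and adds a datetime branch; the rest of the function body sits outside this diff. For reference, a minimal standalone sketch of the documented behaviour follows. The name normalize_date_sketch and its regex are illustrative assumptions, not the repository's implementation.

# Minimal sketch (not the repository's full function): normalize a few date
# representations to ISO 8601 with a trailing 'Z', as the docstring above describes.
import re
from datetime import datetime

def normalize_date_sketch(value):
    # datetime objects are formatted directly
    if isinstance(value, datetime):
        return value.strftime('%Y-%m-%dT%H:%M:%SZ')
    if not value or value == 'N/A':
        return 'N/A'
    # Collapse newlines/tabs/spaces, then keep only the 'YYYY-MM-DD HH:MM:SS' core
    text = re.sub(r'\s+', ' ', str(value).strip())
    match = re.match(r'(\d{4}-\d{2}-\d{2})[T ](\d{2}:\d{2}:\d{2})', text)
    if not match:
        return 'N/A'
    return f"{match.group(1)}T{match.group(2)}Z"

print(normalize_date_sketch('2025-10-15T23:23:47.983068'))        # 2025-10-15T23:23:47Z
print(normalize_date_sketch('2025-06-17 21:21:07+00'))            # 2025-06-17T21:21:07Z
print(normalize_date_sketch(datetime(2025, 10, 15, 23, 23, 47)))  # 2025-10-15T23:23:47Z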
@@ -431,97 +437,73 @@ def fetch_all_pr_metadata_single_query(conn, identifiers, start_date, end_date):
     # Generate file path patterns for the time range
     file_patterns = generate_file_path_patterns(start_date, end_date)
 
-
+    if not file_patterns:
+        print(" ✗ Error: No GHArchive data files found for the specified date range")
+        return {}
+
+    # Build identifier list for IN clause with proper escaping
     identifier_list = ', '.join([f"'{id}'" for id in identifiers])
 
-    # Build file patterns list for SQL (as
+    # Build file patterns list for SQL (as JSON array string)
     file_patterns_sql = '[' + ', '.join([f"'{fp}'" for fp in file_patterns]) + ']'
 
-    #
+    # ============================================================================
+    # REFINED DUCKDB QUERY - Using struct accessors for parsed JSON
+    # ============================================================================
     query = f"""
     WITH pr_events AS (
-
-
-
-
-
-
-
-
-
-
-
-
+        -- Get all PR opened/closed events
+        SELECT
+            CONCAT(
+                REPLACE(repo.url, 'api.github.com/repos/', 'github.com/'),
+                '/pull/',
+                CAST(payload.pull_request.number AS VARCHAR)
+            ) as url,
+            actor.login as pr_author,
+            created_at as event_time,
+            payload.action as event_action
+        FROM read_json({file_patterns_sql}, union_by_name=true, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true, maximum_object_size=2147483648)
+        WHERE
             type = 'PullRequestEvent'
-            AND
-            AND
-            AND
+            AND payload.action IN ('opened', 'closed')
+            AND payload.pull_request.number IS NOT NULL
+            AND actor.login IN ({identifier_list})
     ),
-
-
-
-    SELECT
+    pr_timeline AS (
+        -- Build timeline: opened_at and closed_at (closed could mean merged or rejected)
+        SELECT
             url,
             pr_author,
-            created_at,
-
-
-
-
+            MIN(CASE WHEN event_action = 'opened' THEN event_time END) as created_at,
+            MAX(CASE WHEN event_action = 'closed' THEN event_time END) as closed_at,
+            -- Note: GHArchive doesn't distinguish merged vs closed, so merged_at = NULL
+            NULL as merged_at
+        FROM pr_events
+        GROUP BY url, pr_author
     )
-
-    -- Return deduplicated PR metadata (row_num = 1 already ensures uniqueness)
     SELECT
-
-
-
-
-
-    FROM
-    WHERE
+        url,
+        pr_author,
+        created_at,
+        merged_at,
+        closed_at
+    FROM pr_timeline
+    WHERE created_at IS NOT NULL
     ORDER BY created_at DESC
-    """
+    """
 
     try:
-        #
-
-
-
-
-
-
-
-
-        if cache_exists:
-            results = conn.execute(f"""
-                SELECT url, pr_author, created_at, merged_at, closed_at
-                FROM {cache_table_name}
-                WHERE pr_author IN ({identifier_list})
-            """).fetchall()
-        else:
-            # Execute query
-            results = conn.execute(query).fetchall()
-
-            # Cache the complete results for all future queries in this date range
-            if len(results) > 0:
-                conn.execute(f"""
-                    CREATE TABLE {cache_table_name} AS
-                    SELECT * FROM (
-                        SELECT UNNEST($1) as url, UNNEST($2) as pr_author,
-                               UNNEST($3) as created_at, UNNEST($4) as merged_at,
-                               UNNEST($5) as closed_at
-                    )
-                """, [
-                    [r[0] for r in results],
-                    [r[1] for r in results],
-                    [r[2] for r in results],
-                    [r[3] for r in results],
-                    [r[4] for r in results]
-                ])
-
-        # Group results by agent
+        # Execute the query
+        results = conn.execute(query).fetchall()
+
+        if not results:
+            print(f" ⚠ Warning: Query returned 0 results")
+            print(f" Checked {len(identifiers)} agent(s): {', '.join(identifiers)}")
+            return {}
+
+        # Group results by agent identifier
         metadata_by_agent = defaultdict(list)
-        unique_urls = set()
+        unique_urls = set()
 
         for row in results:
             url = row[0]
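To make the refined query shape above easier to follow, here is a self-contained sketch that runs the same read_json plus opened/closed aggregation against a tiny, made-up newline-delimited file. The sample events, the file name sample-gharchive.json.gz, and the reduced option list are assumptions for illustration, not the repository's data or exact query.

# Self-contained sketch of the refined query pattern above: read newline-delimited,
# gzipped JSON with DuckDB's read_json and collapse PullRequestEvent rows into one
# opened/closed timeline per PR. The sample events and file name are invented here.
import gzip, json, duckdb

events = [
    {"type": "PullRequestEvent", "created_at": "2025-06-17 21:20:00+00",
     "actor": {"login": "agent-a"}, "repo": {"url": "https://api.github.com/repos/org/repo"},
     "payload": {"action": "opened", "pull_request": {"number": 7}}},
    {"type": "PullRequestEvent", "created_at": "2025-06-17 21:21:07+00",
     "actor": {"login": "agent-a"}, "repo": {"url": "https://api.github.com/repos/org/repo"},
     "payload": {"action": "closed", "pull_request": {"number": 7}}},
]
with gzip.open("sample-gharchive.json.gz", "wt") as fh:  # hypothetical local fixture
    fh.write("\n".join(json.dumps(e) for e in events))

conn = duckdb.connect()
rows = conn.execute("""
    WITH pr_events AS (
        SELECT
            CONCAT(REPLACE(repo.url, 'api.github.com/repos/', 'github.com/'),
                   '/pull/', CAST(payload.pull_request.number AS VARCHAR)) AS url,
            actor.login AS pr_author,
            created_at AS event_time,
            payload.action AS event_action
        FROM read_json('sample-gharchive.json.gz',
                       format='newline_delimited', compression='gzip',
                       union_by_name=true, ignore_errors=true)
        WHERE type = 'PullRequestEvent'
    )
    SELECT url, pr_author,
           MIN(CASE WHEN event_action = 'opened' THEN event_time END) AS created_at,
           MAX(CASE WHEN event_action = 'closed' THEN event_time END) AS closed_at
    FROM pr_events
    GROUP BY url, pr_author
""").fetchall()
print(rows)  # one row per PR with its opened/closed timestamps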
@@ -530,28 +512,36 @@ def fetch_all_pr_metadata_single_query(conn, identifiers, start_date, end_date):
             merged_at = normalize_date_format(row[3]) if row[3] else None
             closed_at = normalize_date_format(row[4]) if row[4] else None
 
-            #
+            # Skip if no valid URL
+            if not url:
+                continue
+
+            # Track unique URLs for verification
             unique_urls.add(url)
 
-
+            # Build metadata record
+            pr_metadata = {
                 'html_url': url,
                 'created_at': created_at,
                 'merged_at': merged_at,
                 'closed_at': closed_at,
-            }
+            }
 
-
-
-
-
-
-
+            metadata_by_agent[pr_author].append(pr_metadata)
+
+        # Log results per agent
+        agents_with_data = sum(1 for prs in metadata_by_agent.values() if prs)
+        print(f" ✓ Coverage: {agents_with_data}/{len(identifiers)} agents have PR data")
+
+        for agent_id in sorted(metadata_by_agent.keys()):
+            pr_count = len(metadata_by_agent[agent_id])
+            print(f" - {agent_id}: {pr_count} PRs")
 
-        # Convert defaultdict to regular dict
+        # Convert defaultdict to regular dict before returning
         return dict(metadata_by_agent)
 
     except Exception as e:
-        print(f"DuckDB error: {str(e)}")
+        print(f" ✗ DuckDB query error: {str(e)}")
         import traceback
         traceback.print_exc()
         return {}
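The grouping and coverage logging added above can be exercised in isolation. The sketch below uses invented result rows and agent names purely to illustrate the defaultdict grouping and the per-agent summary output; it is not the repository's code.

# Small sketch of the grouping/logging step above, using made-up rows in the same
# (url, pr_author, created_at, merged_at, closed_at) order the query returns.
from collections import defaultdict

results = [
    ("https://github.com/org/repo/pull/7", "agent-a", "2025-06-17T21:20:00Z", None, "2025-06-17T21:21:07Z"),
    ("https://github.com/org/repo/pull/9", "agent-b", "2025-06-18T08:00:00Z", None, None),
]
identifiers = ["agent-a", "agent-b", "agent-c"]  # hypothetical agent logins

metadata_by_agent = defaultdict(list)
for url, pr_author, created_at, merged_at, closed_at in results:
    if not url:
        continue
    metadata_by_agent[pr_author].append({
        "html_url": url,
        "created_at": created_at,
        "merged_at": merged_at,
        "closed_at": closed_at,
    })

agents_with_data = sum(1 for prs in metadata_by_agent.values() if prs)
print(f"Coverage: {agents_with_data}/{len(identifiers)} agents have PR data")
for agent_id in sorted(metadata_by_agent):
    print(f"- {agent_id}: {len(metadata_by_agent[agent_id])} PRs")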