zhimin-z committed on
Commit
f9d762e
·
1 Parent(s): dbe55b8
Files changed (1) hide show
  1. msr.py +48 -15
msr.py CHANGED
@@ -397,7 +397,9 @@ def fetch_all_pr_metadata_streaming(conn, identifiers, start_date, end_date):
397
  file_patterns_sql = '[' + ', '.join([f"'{fp}'" for fp in file_patterns]) + ']'
398
 
399
  # Query for this batch
400
- # Note: For PullRequestReviewEvent, we use the actor as author
 
 
401
  query = f"""
402
  SELECT DISTINCT
403
  CONCAT(
@@ -405,6 +407,7 @@ def fetch_all_pr_metadata_streaming(conn, identifiers, start_date, end_date):
405
  '/pull/',
406
  CAST(payload.pull_request.number AS VARCHAR)
407
  ) as url,
 
408
  TRY_CAST(json_extract_string(to_json(payload), '$.pull_request.user.login') AS VARCHAR) as pr_author,
409
  TRY_CAST(json_extract_string(to_json(payload), '$.pull_request.created_at') AS VARCHAR) as created_at,
410
  TRY_CAST(json_extract_string(to_json(payload), '$.pull_request.merged_at') AS VARCHAR) as merged_at,
@@ -422,34 +425,64 @@ def fetch_all_pr_metadata_streaming(conn, identifiers, start_date, end_date):
422
  AND payload.pull_request.number IS NOT NULL
423
  AND TRY_CAST(json_extract_string(to_json(payload), '$.pull_request.created_at') AS VARCHAR) IS NOT NULL
424
  AND TRY_CAST(json_extract_string(to_json(payload), '$.pull_request.user.login') AS VARCHAR) IN ({identifier_list})
 
425
  """
426
 
427
  try:
428
  results = conn.execute(query).fetchall()
429
- batch_prs = 0
430
-
431
- # Add results to accumulating dictionary
 
432
  for row in results:
433
  url = row[0]
434
- pr_author = row[1]
435
- created_at = normalize_date_format(row[2]) if row[2] else None
436
- merged_at = normalize_date_format(row[3]) if row[3] else None
437
- closed_at = normalize_date_format(row[4]) if row[4] else None
438
-
439
- if not url:
 
440
  continue
441
-
442
- pr_metadata = {
443
- 'html_url': url,
444
  'created_at': created_at,
445
  'merged_at': merged_at,
446
  'closed_at': closed_at,
447
  }
448
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
449
  metadata_by_agent[pr_author].append(pr_metadata)
450
  batch_prs += 1
451
  total_prs += 1
452
-
453
  print(f"✓ {batch_prs} PRs found")
454
 
455
  except Exception as e:
 
397
  file_patterns_sql = '[' + ', '.join([f"'{fp}'" for fp in file_patterns]) + ']'
398
 
399
  # Query for this batch
400
+ # We need both opened and closed events:
401
+ # - opened events: to identify PRs created within the time frame
402
+ # - closed events: to determine if/when those PRs were merged
403
  query = f"""
404
  SELECT DISTINCT
405
  CONCAT(
 
407
  '/pull/',
408
  CAST(payload.pull_request.number AS VARCHAR)
409
  ) as url,
410
+ TRY_CAST(json_extract_string(to_json(payload), '$.action') AS VARCHAR) as action,
411
  TRY_CAST(json_extract_string(to_json(payload), '$.pull_request.user.login') AS VARCHAR) as pr_author,
412
  TRY_CAST(json_extract_string(to_json(payload), '$.pull_request.created_at') AS VARCHAR) as created_at,
413
  TRY_CAST(json_extract_string(to_json(payload), '$.pull_request.merged_at') AS VARCHAR) as merged_at,
 
425
  AND payload.pull_request.number IS NOT NULL
426
  AND TRY_CAST(json_extract_string(to_json(payload), '$.pull_request.created_at') AS VARCHAR) IS NOT NULL
427
  AND TRY_CAST(json_extract_string(to_json(payload), '$.pull_request.user.login') AS VARCHAR) IN ({identifier_list})
428
+ AND TRY_CAST(json_extract_string(to_json(payload), '$.action') AS VARCHAR) IN ('opened', 'closed')
429
  """
430
 
431
  try:
432
  results = conn.execute(query).fetchall()
433
+
434
+ # Group events by PR URL to merge opened and closed events
435
+ pr_events = defaultdict(lambda: {'opened': None, 'closed': None})
436
+
437
  for row in results:
438
  url = row[0]
439
+ action = row[1]
440
+ pr_author = row[2]
441
+ created_at = normalize_date_format(row[3]) if row[3] else None
442
+ merged_at = normalize_date_format(row[4]) if row[4] else None
443
+ closed_at = normalize_date_format(row[5]) if row[5] else None
444
+
445
+ if not url or not action:
446
  continue
447
+
448
+ event_data = {
449
+ 'pr_author': pr_author,
450
  'created_at': created_at,
451
  'merged_at': merged_at,
452
  'closed_at': closed_at,
453
  }
454
+
455
+ pr_events[url][action] = event_data
456
+
457
+ # Only include PRs that have an 'opened' event
458
+ # Use closed event data (if available) to get merged_at and closed_at
459
+ batch_prs = 0
460
+ for url, events in pr_events.items():
461
+ if not events['opened']:
462
+ # Skip PRs without an 'opened' event in this time frame
463
+ continue
464
+
465
+ opened_event = events['opened']
466
+ closed_event = events['closed']
467
+
468
+ pr_author = opened_event['pr_author']
469
+ if not pr_author:
470
+ continue
471
+
472
+ # Build complete PR metadata
473
+ # created_at comes from opened event
474
+ # merged_at and closed_at come from closed event (if available)
475
+ pr_metadata = {
476
+ 'html_url': url,
477
+ 'created_at': opened_event['created_at'],
478
+ 'merged_at': closed_event['merged_at'] if closed_event else None,
479
+ 'closed_at': closed_event['closed_at'] if closed_event else None,
480
+ }
481
+
482
  metadata_by_agent[pr_author].append(pr_metadata)
483
  batch_prs += 1
484
  total_prs += 1
485
+
486
  print(f"✓ {batch_prs} PRs found")
487
 
488
  except Exception as e: