zhimin-z committed
Commit · f9d762e
Parent(s): dbe55b8

refine
msr.py CHANGED
@@ -397,7 +397,9 @@ def fetch_all_pr_metadata_streaming(conn, identifiers, start_date, end_date):
         file_patterns_sql = '[' + ', '.join([f"'{fp}'" for fp in file_patterns]) + ']'
 
         # Query for this batch
-        #
+        # We need both opened and closed events:
+        # - opened events: to identify PRs created within the time frame
+        # - closed events: to determine if/when those PRs were merged
         query = f"""
             SELECT DISTINCT
                 CONCAT(
@@ -405,6 +407,7 @@ def fetch_all_pr_metadata_streaming(conn, identifiers, start_date, end_date):
                     '/pull/',
                     CAST(payload.pull_request.number AS VARCHAR)
                 ) as url,
+                TRY_CAST(json_extract_string(to_json(payload), '$.action') AS VARCHAR) as action,
                 TRY_CAST(json_extract_string(to_json(payload), '$.pull_request.user.login') AS VARCHAR) as pr_author,
                 TRY_CAST(json_extract_string(to_json(payload), '$.pull_request.created_at') AS VARCHAR) as created_at,
                 TRY_CAST(json_extract_string(to_json(payload), '$.pull_request.merged_at') AS VARCHAR) as merged_at,
@@ -422,34 +425,64 @@ def fetch_all_pr_metadata_streaming(conn, identifiers, start_date, end_date):
                 AND payload.pull_request.number IS NOT NULL
                 AND TRY_CAST(json_extract_string(to_json(payload), '$.pull_request.created_at') AS VARCHAR) IS NOT NULL
                 AND TRY_CAST(json_extract_string(to_json(payload), '$.pull_request.user.login') AS VARCHAR) IN ({identifier_list})
+                AND TRY_CAST(json_extract_string(to_json(payload), '$.action') AS VARCHAR) IN ('opened', 'closed')
         """
 
         try:
             results = conn.execute(query).fetchall()
-
-
-
+
+            # Group events by PR URL to merge opened and closed events
+            pr_events = defaultdict(lambda: {'opened': None, 'closed': None})
+
             for row in results:
                 url = row[0]
-
-
-
-
-
-
+                action = row[1]
+                pr_author = row[2]
+                created_at = normalize_date_format(row[3]) if row[3] else None
+                merged_at = normalize_date_format(row[4]) if row[4] else None
+                closed_at = normalize_date_format(row[5]) if row[5] else None
+
+                if not url or not action:
                     continue
-
-
-                    '
+
+                event_data = {
+                    'pr_author': pr_author,
                     'created_at': created_at,
                     'merged_at': merged_at,
                     'closed_at': closed_at,
                 }
-
+
+                pr_events[url][action] = event_data
+
+            # Only include PRs that have an 'opened' event
+            # Use closed event data (if available) to get merged_at and closed_at
+            batch_prs = 0
+            for url, events in pr_events.items():
+                if not events['opened']:
+                    # Skip PRs without an 'opened' event in this time frame
+                    continue
+
+                opened_event = events['opened']
+                closed_event = events['closed']
+
+                pr_author = opened_event['pr_author']
+                if not pr_author:
+                    continue
+
+                # Build complete PR metadata
+                # created_at comes from opened event
+                # merged_at and closed_at come from closed event (if available)
+                pr_metadata = {
+                    'html_url': url,
+                    'created_at': opened_event['created_at'],
+                    'merged_at': closed_event['merged_at'] if closed_event else None,
+                    'closed_at': closed_event['closed_at'] if closed_event else None,
+                }
+
                 metadata_by_agent[pr_author].append(pr_metadata)
                 batch_prs += 1
                 total_prs += 1
-
+
             print(f"✓ {batch_prs} PRs found")
 
         except Exception as e:
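The query leans on DuckDB's JSON functions (`to_json`, `json_extract_string`, `TRY_CAST`). A minimal standalone probe of that extraction pattern is below; the payload literal is a made-up stand-in for a GH Archive PullRequestEvent, and because it is already a JSON string this sketch skips the `to_json(payload)` call that msr.py needs for its struct-typed `payload` column.

```python
import duckdb

# Made-up stand-in for one GH Archive PullRequestEvent payload.
payload = (
    '{"action": "opened", "pull_request": {"number": 7, '
    '"user": {"login": "example-bot"}, "created_at": "2024-05-01T10:00:00Z"}}'
)

conn = duckdb.connect()
row = conn.execute(
    """
    SELECT
        TRY_CAST(json_extract_string(?, '$.action') AS VARCHAR) AS action,
        TRY_CAST(json_extract_string(?, '$.pull_request.user.login') AS VARCHAR) AS pr_author,
        TRY_CAST(json_extract_string(?, '$.pull_request.created_at') AS VARCHAR) AS created_at
    """,
    [payload, payload, payload],
).fetchone()
print(row)  # ('opened', 'example-bot', '2024-05-01T10:00:00Z')
```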
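The grouping-then-merging step is the heart of the change: a PR appears in the archive as separate `opened` and `closed` events, and only their union carries both `created_at` and `merged_at`/`closed_at`. Here is a self-contained sketch of that technique with fabricated rows; the real code feeds it query results and a `normalize_date_format` helper defined elsewhere in msr.py.

```python
from collections import defaultdict

# Fabricated rows in the query's column order:
# (url, action, pr_author, created_at, merged_at, closed_at)
rows = [
    ('https://github.com/o/r/pull/1', 'opened', 'example-bot',
     '2024-05-01T10:00:00Z', None, None),
    ('https://github.com/o/r/pull/1', 'closed', 'example-bot',
     '2024-05-01T10:00:00Z', '2024-05-02T09:00:00Z', '2024-05-02T09:00:00Z'),
    # Closed in the window but opened before it: no 'opened' event, so dropped.
    ('https://github.com/o/r/pull/2', 'closed', 'example-bot',
     '2024-04-01T08:00:00Z', None, '2024-05-03T12:00:00Z'),
]

# Group the two event kinds by PR URL.
pr_events = defaultdict(lambda: {'opened': None, 'closed': None})
for url, action, pr_author, created_at, merged_at, closed_at in rows:
    pr_events[url][action] = {
        'pr_author': pr_author,
        'created_at': created_at,
        'merged_at': merged_at,
        'closed_at': closed_at,
    }

# Keep only PRs opened in the window; take merge/close times from the
# 'closed' event when one exists.
metadata_by_agent = defaultdict(list)
for url, events in pr_events.items():
    opened, closed = events['opened'], events['closed']
    if not opened or not opened['pr_author']:
        continue
    metadata_by_agent[opened['pr_author']].append({
        'html_url': url,
        'created_at': opened['created_at'],
        'merged_at': closed['merged_at'] if closed else None,
        'closed_at': closed['closed_at'] if closed else None,
    })

print(dict(metadata_by_agent))  # pull/1 kept, with merge time; pull/2 dropped
```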
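`normalize_date_format` is called on every timestamp but is not part of this diff; its definition lives elsewhere in msr.py. A plausible sketch, assuming it canonicalizes GH Archive's ISO-8601 timestamps to UTC:

```python
from datetime import datetime, timezone

def normalize_date_format(ts: str) -> str:
    # Hypothetical implementation; the real helper is defined elsewhere
    # in msr.py. GitHub's trailing 'Z' is only understood by
    # fromisoformat() from Python 3.11 on, hence the replace().
    dt = datetime.fromisoformat(ts.replace('Z', '+00:00'))
    return dt.astimezone(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')

print(normalize_date_format('2024-05-01T10:00:00Z'))  # 2024-05-01T10:00:00Z
```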