zhimin-z
committed on
Commit
·
73aa8ef
1
Parent(s):
a95b059
refine
Browse files
msr.py
CHANGED
|
@@ -397,38 +397,53 @@ def fetch_all_pr_metadata_streaming(conn, identifiers, start_date, end_date):
|
|
| 397 |
file_patterns_sql = '[' + ', '.join([f"'{fp}'" for fp in file_patterns]) + ']'
|
| 398 |
|
| 399 |
# Query for this batch
|
|
|
|
|
|
|
| 400 |
query = f"""
|
| 401 |
-
WITH
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 402 |
SELECT
|
| 403 |
CONCAT(
|
| 404 |
REPLACE(repo.url, 'api.github.com/repos/', 'github.com/'),
|
| 405 |
'/pull/',
|
| 406 |
CAST(payload.pull_request.number AS VARCHAR)
|
| 407 |
) as url,
|
| 408 |
-
payload.pull_request.user.login as pr_author, -- CHANGED
|
| 409 |
-
created_at as event_time,
|
| 410 |
payload.action as event_action,
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
|
|
|
| 418 |
),
|
| 419 |
pr_timeline AS (
|
| 420 |
SELECT
|
| 421 |
url,
|
| 422 |
-
pr_author,
|
| 423 |
MIN(CASE WHEN event_action = 'opened' THEN event_time END) as created_at,
|
| 424 |
MAX(CASE WHEN event_action = 'closed' THEN event_time END) as closed_at,
|
| 425 |
MAX(merged_at) as merged_at
|
| 426 |
FROM pr_events
|
| 427 |
-
GROUP BY url
|
| 428 |
)
|
| 429 |
SELECT url, pr_author, created_at, merged_at, closed_at
|
| 430 |
FROM pr_timeline
|
| 431 |
WHERE created_at IS NOT NULL
|
|
|
|
| 432 |
"""
|
| 433 |
|
| 434 |
try:
|
|
|
|
| 397 |
file_patterns_sql = '[' + ', '.join([f"'{fp}'" for fp in file_patterns]) + ']'
|
| 398 |
|
| 399 |
# Query for this batch
|
| 400 |
+
# Note: GitHub Archive schema varies - older data has full pull_request objects,
|
| 401 |
+
# newer data (Oct 2025+) has stripped-down objects. We use TRY_CAST() over json_extract_string() to handle both.
|
| 402 |
query = f"""
|
| 403 |
+
WITH raw_events AS (
|
| 404 |
+
SELECT * FROM read_json(
|
| 405 |
+
{file_patterns_sql},
|
| 406 |
+
union_by_name=true,
|
| 407 |
+
filename=true,
|
| 408 |
+
compression='gzip',
|
| 409 |
+
format='newline_delimited',
|
| 410 |
+
ignore_errors=true,
|
| 411 |
+
maximum_object_size=2147483648
|
| 412 |
+
)
|
| 413 |
+
WHERE type = 'PullRequestEvent'
|
| 414 |
+
AND payload.action IN ('opened', 'closed')
|
| 415 |
+
),
|
| 416 |
+
pr_events AS (
|
| 417 |
SELECT
|
| 418 |
CONCAT(
|
| 419 |
REPLACE(repo.url, 'api.github.com/repos/', 'github.com/'),
|
| 420 |
'/pull/',
|
| 421 |
CAST(payload.pull_request.number AS VARCHAR)
|
| 422 |
) as url,
|
|
|
|
|
|
|
| 423 |
payload.action as event_action,
|
| 424 |
+
CASE
|
| 425 |
+
WHEN payload.action = 'opened' THEN actor.login
|
| 426 |
+
ELSE NULL
|
| 427 |
+
END as pr_author,
|
| 428 |
+
created_at as event_time,
|
| 429 |
+
TRY_CAST(json_extract_string(to_json(payload), '$.pull_request.merged_at') AS VARCHAR) as merged_at
|
| 430 |
+
FROM raw_events
|
| 431 |
+
WHERE payload.pull_request.number IS NOT NULL
|
| 432 |
),
|
| 433 |
pr_timeline AS (
|
| 434 |
SELECT
|
| 435 |
url,
|
| 436 |
+
MAX(pr_author) as pr_author,
|
| 437 |
MIN(CASE WHEN event_action = 'opened' THEN event_time END) as created_at,
|
| 438 |
MAX(CASE WHEN event_action = 'closed' THEN event_time END) as closed_at,
|
| 439 |
MAX(merged_at) as merged_at
|
| 440 |
FROM pr_events
|
| 441 |
+
GROUP BY url
|
| 442 |
)
|
| 443 |
SELECT url, pr_author, created_at, merged_at, closed_at
|
| 444 |
FROM pr_timeline
|
| 445 |
WHERE created_at IS NOT NULL
|
| 446 |
+
AND pr_author IN ({identifier_list})
|
| 447 |
"""
|
| 448 |
|
| 449 |
try:
|