zhimin-z committed on
Commit
73aa8ef
·
1 Parent(s): a95b059
Files changed (1) hide show
  1. msr.py +27 -12
msr.py CHANGED
@@ -397,38 +397,53 @@ def fetch_all_pr_metadata_streaming(conn, identifiers, start_date, end_date):
397
  file_patterns_sql = '[' + ', '.join([f"'{fp}'" for fp in file_patterns]) + ']'
398
 
399
  # Query for this batch
 
 
400
  query = f"""
401
- WITH pr_events AS (
 
 
 
 
 
 
 
 
 
 
 
 
 
402
  SELECT
403
  CONCAT(
404
  REPLACE(repo.url, 'api.github.com/repos/', 'github.com/'),
405
  '/pull/',
406
  CAST(payload.pull_request.number AS VARCHAR)
407
  ) as url,
408
- payload.pull_request.user.login as pr_author, -- CHANGED
409
- created_at as event_time,
410
  payload.action as event_action,
411
- payload.pull_request.merged_at as merged_at -- CHANGED
412
- FROM read_json({file_patterns_sql}, union_by_name=true, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true, maximum_object_size=2147483648)
413
- WHERE
414
- type = 'PullRequestEvent'
415
- AND payload.action IN ('opened', 'closed')
416
- AND payload.pull_request.number IS NOT NULL
417
- AND payload.pull_request.user.login IN ({identifier_list}) -- CHANGED
 
418
  ),
419
  pr_timeline AS (
420
  SELECT
421
  url,
422
- pr_author,
423
  MIN(CASE WHEN event_action = 'opened' THEN event_time END) as created_at,
424
  MAX(CASE WHEN event_action = 'closed' THEN event_time END) as closed_at,
425
  MAX(merged_at) as merged_at
426
  FROM pr_events
427
- GROUP BY url, pr_author
428
  )
429
  SELECT url, pr_author, created_at, merged_at, closed_at
430
  FROM pr_timeline
431
  WHERE created_at IS NOT NULL
 
432
  """
433
 
434
  try:
 
397
  file_patterns_sql = '[' + ', '.join([f"'{fp}'" for fp in file_patterns]) + ']'
398
 
399
  # Query for this batch
400
+ # Note: GitHub Archive schema varies - older data has full pull_request objects,
401
+ # newer data (Oct 2025+) has stripped-down objects. We read merged_at via TRY_CAST over json_extract_string(to_json(payload), ...) to handle both.
402
  query = f"""
403
+ WITH raw_events AS (
404
+ SELECT * FROM read_json(
405
+ {file_patterns_sql},
406
+ union_by_name=true,
407
+ filename=true,
408
+ compression='gzip',
409
+ format='newline_delimited',
410
+ ignore_errors=true,
411
+ maximum_object_size=2147483648
412
+ )
413
+ WHERE type = 'PullRequestEvent'
414
+ AND payload.action IN ('opened', 'closed')
415
+ ),
416
+ pr_events AS (
417
  SELECT
418
  CONCAT(
419
  REPLACE(repo.url, 'api.github.com/repos/', 'github.com/'),
420
  '/pull/',
421
  CAST(payload.pull_request.number AS VARCHAR)
422
  ) as url,
 
 
423
  payload.action as event_action,
424
+ CASE
425
+ WHEN payload.action = 'opened' THEN actor.login
426
+ ELSE NULL
427
+ END as pr_author,
428
+ created_at as event_time,
429
+ TRY_CAST(json_extract_string(to_json(payload), '$.pull_request.merged_at') AS VARCHAR) as merged_at
430
+ FROM raw_events
431
+ WHERE payload.pull_request.number IS NOT NULL
432
  ),
433
  pr_timeline AS (
434
  SELECT
435
  url,
436
+ MAX(pr_author) as pr_author,
437
  MIN(CASE WHEN event_action = 'opened' THEN event_time END) as created_at,
438
  MAX(CASE WHEN event_action = 'closed' THEN event_time END) as closed_at,
439
  MAX(merged_at) as merged_at
440
  FROM pr_events
441
+ GROUP BY url
442
  )
443
  SELECT url, pr_author, created_at, merged_at, closed_at
444
  FROM pr_timeline
445
  WHERE created_at IS NOT NULL
446
+ AND pr_author IN ({identifier_list})
447
  """
448
 
449
  try: