Spaces:
Sleeping
Sleeping
zhimin-z
committed on
Commit
·
c3011cc
1
Parent(s):
9278f09
refine
Browse files
msr.py
CHANGED
|
@@ -433,7 +433,11 @@ def fetch_all_pr_metadata_single_query(conn, identifiers, start_date, end_date):
|
|
| 433 |
# Build identifier list for IN clause
|
| 434 |
identifier_list = ', '.join([f"'{id}'" for id in identifiers])
|
| 435 |
|
| 436 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 437 |
# Optimized: Single file scan + ROW_NUMBER() deduplication (no DISTINCT)
|
| 438 |
query = f"""
|
| 439 |
WITH all_review_events AS (
|
|
@@ -444,7 +448,7 @@ def fetch_all_pr_metadata_single_query(conn, identifiers, start_date, end_date):
|
|
| 444 |
TRY_CAST(json_extract_string(repo, '$.name') AS VARCHAR) as repo_name,
|
| 445 |
payload,
|
| 446 |
created_at
|
| 447 |
-
FROM read_json(
|
| 448 |
WHERE
|
| 449 |
TRY_CAST(type AS VARCHAR) IN ('PullRequestReviewEvent', 'IssueCommentEvent', 'PullRequestReviewCommentEvent')
|
| 450 |
AND TRY_CAST(json_extract_string(actor, '$.login') AS VARCHAR) IN ({identifier_list})
|
|
@@ -490,9 +494,9 @@ def fetch_all_pr_metadata_single_query(conn, identifiers, start_date, end_date):
|
|
| 490 |
TRY_CAST(json_extract_string(payload, '$.pull_request.closed_at') AS VARCHAR) as closed_at,
|
| 491 |
created_at,
|
| 492 |
ROW_NUMBER() OVER (PARTITION BY json_extract_string(payload, '$.pull_request.html_url') ORDER BY created_at DESC) as rn
|
| 493 |
-
FROM read_json(
|
| 494 |
WHERE
|
| 495 |
-
|
| 496 |
AND TRY_CAST(json_extract_string(payload, '$.action') AS VARCHAR) = 'closed'
|
| 497 |
AND json_extract_string(payload, '$.pull_request.html_url') IS NOT NULL
|
| 498 |
AND json_extract_string(payload, '$.pull_request.html_url') IN (
|
|
@@ -545,8 +549,8 @@ def fetch_all_pr_metadata_single_query(conn, identifiers, start_date, end_date):
|
|
| 545 |
WHERE reviewer IN ({identifier_list})
|
| 546 |
""").fetchall()
|
| 547 |
else:
|
| 548 |
-
# Execute query
|
| 549 |
-
results = conn.execute(query
|
| 550 |
|
| 551 |
# Cache the complete results for all future queries in this date range
|
| 552 |
if len(results) > 0:
|
|
|
|
| 433 |
# Build identifier list for IN clause
|
| 434 |
identifier_list = ', '.join([f"'{id}'" for id in identifiers])
|
| 435 |
|
| 436 |
+
# Convert file patterns to SQL array format for direct interpolation
|
| 437 |
+
review_patterns_sql = str(review_patterns).replace("'", "'")
|
| 438 |
+
status_patterns_sql = str(status_patterns).replace("'", "'")
|
| 439 |
+
|
| 440 |
+
# Build comprehensive query with CTEs using direct SQL array format (JSON.gz format)
|
| 441 |
# Optimized: Single file scan + ROW_NUMBER() deduplication (no DISTINCT)
|
| 442 |
query = f"""
|
| 443 |
WITH all_review_events AS (
|
|
|
|
| 448 |
TRY_CAST(json_extract_string(repo, '$.name') AS VARCHAR) as repo_name,
|
| 449 |
payload,
|
| 450 |
created_at
|
| 451 |
+
FROM read_json({review_patterns_sql}, union_by_name=true, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true, maximum_object_size=2147483648)
|
| 452 |
WHERE
|
| 453 |
TRY_CAST(type AS VARCHAR) IN ('PullRequestReviewEvent', 'IssueCommentEvent', 'PullRequestReviewCommentEvent')
|
| 454 |
AND TRY_CAST(json_extract_string(actor, '$.login') AS VARCHAR) IN ({identifier_list})
|
|
|
|
| 494 |
TRY_CAST(json_extract_string(payload, '$.pull_request.closed_at') AS VARCHAR) as closed_at,
|
| 495 |
created_at,
|
| 496 |
ROW_NUMBER() OVER (PARTITION BY json_extract_string(payload, '$.pull_request.html_url') ORDER BY created_at DESC) as rn
|
| 497 |
+
FROM read_json({status_patterns_sql}, union_by_name=true, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true, maximum_object_size=2147483648)
|
| 498 |
WHERE
|
| 499 |
+
type = 'PullRequestEvent'
|
| 500 |
AND TRY_CAST(json_extract_string(payload, '$.action') AS VARCHAR) = 'closed'
|
| 501 |
AND json_extract_string(payload, '$.pull_request.html_url') IS NOT NULL
|
| 502 |
AND json_extract_string(payload, '$.pull_request.html_url') IN (
|
|
|
|
| 549 |
WHERE reviewer IN ({identifier_list})
|
| 550 |
""").fetchall()
|
| 551 |
else:
|
| 552 |
+
# Execute query using f-string interpolation
|
| 553 |
+
results = conn.execute(query).fetchall()
|
| 554 |
|
| 555 |
# Cache the complete results for all future queries in this date range
|
| 556 |
if len(results) > 0:
|