Spaces:
Sleeping
Sleeping
zhimin-z
committed on
Commit
·
8212c14
1
Parent(s):
e5d1adc
fix
Browse files
msr.py
CHANGED
|
@@ -306,14 +306,7 @@ def get_duckdb_connection():
|
|
| 306 |
conn.execute(f"SET threads TO 6;")
|
| 307 |
conn.execute(f"SET max_memory = '50GB';")
|
| 308 |
conn.execute("SET temp_directory = '/tmp/duckdb_temp';")
|
| 309 |
-
|
| 310 |
-
# GZIP PARALLEL DECOMPRESSION (only needed for .json.gz files)
|
| 311 |
-
try:
|
| 312 |
-
conn.execute("SET extension_directory = '/tmp/duckdb_ext';")
|
| 313 |
-
conn.execute("INSTALL 'gzip';")
|
| 314 |
-
conn.execute("LOAD 'gzip';")
|
| 315 |
-
except Exception as e:
|
| 316 |
-
print(f" ⚠ Warning: Could not load gzip extension: {e}")
|
| 317 |
# PERFORMANCE OPTIMIZATIONS
|
| 318 |
conn.execute("SET preserve_insertion_order = false;") # Disable expensive ordering
|
| 319 |
conn.execute("SET enable_object_cache = true;") # Cache repeatedly read files
|
|
@@ -419,7 +412,14 @@ def fetch_all_review_metadata_streaming(conn, identifiers, start_date, end_date)
|
|
| 419 |
WHEN type = 'PullRequestReviewCommentEvent' THEN struct_extract(struct_extract(payload.comment, 'user'), 'login')
|
| 420 |
END as reviewer,
|
| 421 |
created_at as reviewed_at
|
| 422 |
-
FROM read_json(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 423 |
WHERE
|
| 424 |
type IN ('PullRequestReviewEvent', 'PullRequestReviewCommentEvent')
|
| 425 |
AND payload.pull_request.number IS NOT NULL
|
|
@@ -442,7 +442,7 @@ def fetch_all_review_metadata_streaming(conn, identifiers, start_date, end_date)
|
|
| 442 |
'/pull/',
|
| 443 |
CAST(payload.pull_request.number AS VARCHAR)
|
| 444 |
) ORDER BY created_at DESC) as rn
|
| 445 |
-
FROM read_json({file_patterns_sql}, union_by_name=false, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true
|
| 446 |
WHERE
|
| 447 |
type = 'PullRequestEvent'
|
| 448 |
AND payload.action = 'closed'
|
|
|
|
| 306 |
conn.execute(f"SET threads TO 6;")
|
| 307 |
conn.execute(f"SET max_memory = '50GB';")
|
| 308 |
conn.execute("SET temp_directory = '/tmp/duckdb_temp';")
|
| 309 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 310 |
# PERFORMANCE OPTIMIZATIONS
|
| 311 |
conn.execute("SET preserve_insertion_order = false;") # Disable expensive ordering
|
| 312 |
conn.execute("SET enable_object_cache = true;") # Cache repeatedly read files
|
|
|
|
| 412 |
WHEN type = 'PullRequestReviewCommentEvent' THEN struct_extract(struct_extract(payload.comment, 'user'), 'login')
|
| 413 |
END as reviewer,
|
| 414 |
created_at as reviewed_at
|
| 415 |
+
FROM read_json(
|
| 416 |
+
{file_patterns_sql},
|
| 417 |
+
union_by_name=true,
|
| 418 |
+
filename=true,
|
| 419 |
+
compression='gzip',
|
| 420 |
+
format='newline_delimited',
|
| 421 |
+
ignore_errors=true
|
| 422 |
+
)
|
| 423 |
WHERE
|
| 424 |
type IN ('PullRequestReviewEvent', 'PullRequestReviewCommentEvent')
|
| 425 |
AND payload.pull_request.number IS NOT NULL
|
|
|
|
| 442 |
'/pull/',
|
| 443 |
CAST(payload.pull_request.number AS VARCHAR)
|
| 444 |
) ORDER BY created_at DESC) as rn
|
| 445 |
+
FROM read_json({file_patterns_sql}, union_by_name=false, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true)
|
| 446 |
WHERE
|
| 447 |
type = 'PullRequestEvent'
|
| 448 |
AND payload.action = 'closed'
|