zhimin-z committed on
Commit
8212c14
·
1 Parent(s): e5d1adc
Files changed (1) hide show
  1. msr.py +10 -10
msr.py CHANGED
@@ -306,14 +306,7 @@ def get_duckdb_connection():
306
  conn.execute(f"SET threads TO 6;")
307
  conn.execute(f"SET max_memory = '50GB';")
308
  conn.execute("SET temp_directory = '/tmp/duckdb_temp';")
309
-
310
- # GZIP PARALLEL DECOMPRESSION (only needed for .json.gz files)
311
- try:
312
- conn.execute("SET extension_directory = '/tmp/duckdb_ext';")
313
- conn.execute("INSTALL 'gzip';")
314
- conn.execute("LOAD 'gzip';")
315
- except Exception as e:
316
- print(f" ⚠ Warning: Could not load gzip extension: {e}")
317
  # PERFORMANCE OPTIMIZATIONS
318
  conn.execute("SET preserve_insertion_order = false;") # Disable expensive ordering
319
  conn.execute("SET enable_object_cache = true;") # Cache repeatedly read files
@@ -419,7 +412,14 @@ def fetch_all_review_metadata_streaming(conn, identifiers, start_date, end_date)
419
  WHEN type = 'PullRequestReviewCommentEvent' THEN struct_extract(struct_extract(payload.comment, 'user'), 'login')
420
  END as reviewer,
421
  created_at as reviewed_at
422
- FROM read_json({file_patterns_sql}, union_by_name=true, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true, maximum_object_size=2147483648)
 
 
 
 
 
 
 
423
  WHERE
424
  type IN ('PullRequestReviewEvent', 'PullRequestReviewCommentEvent')
425
  AND payload.pull_request.number IS NOT NULL
@@ -442,7 +442,7 @@ def fetch_all_review_metadata_streaming(conn, identifiers, start_date, end_date)
442
  '/pull/',
443
  CAST(payload.pull_request.number AS VARCHAR)
444
  ) ORDER BY created_at DESC) as rn
445
- FROM read_json({file_patterns_sql}, union_by_name=false, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true, maximum_object_size=2147483648)
446
  WHERE
447
  type = 'PullRequestEvent'
448
  AND payload.action = 'closed'
 
306
  conn.execute(f"SET threads TO 6;")
307
  conn.execute(f"SET max_memory = '50GB';")
308
  conn.execute("SET temp_directory = '/tmp/duckdb_temp';")
309
+
 
 
 
 
 
 
 
310
  # PERFORMANCE OPTIMIZATIONS
311
  conn.execute("SET preserve_insertion_order = false;") # Disable expensive ordering
312
  conn.execute("SET enable_object_cache = true;") # Cache repeatedly read files
 
412
  WHEN type = 'PullRequestReviewCommentEvent' THEN struct_extract(struct_extract(payload.comment, 'user'), 'login')
413
  END as reviewer,
414
  created_at as reviewed_at
415
+ FROM read_json(
416
+ {file_patterns_sql},
417
+ union_by_name=true,
418
+ filename=true,
419
+ compression='gzip',
420
+ format='newline_delimited',
421
+ ignore_errors=true
422
+ )
423
  WHERE
424
  type IN ('PullRequestReviewEvent', 'PullRequestReviewCommentEvent')
425
  AND payload.pull_request.number IS NOT NULL
 
442
  '/pull/',
443
  CAST(payload.pull_request.number AS VARCHAR)
444
  ) ORDER BY created_at DESC) as rn
445
+ FROM read_json({file_patterns_sql}, union_by_name=false, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true)
446
  WHERE
447
  type = 'PullRequestEvent'
448
  AND payload.action = 'closed'