zhimin-z committed
Commit 2faae44 · 1 Parent(s): 2344e22
Files changed (1):
  msr.py +22 -12
msr.py CHANGED
@@ -40,13 +40,8 @@ LEADERBOARD_TIME_FRAME_DAYS = 180
 # Git sync configuration (mandatory to get latest bot data)
 GIT_SYNC_TIMEOUT = 300  # 5 minutes timeout for git pull
 
-# OPTIMIZED DUCKDB CONFIGURATION
-DUCKDB_THREADS = 16
-DUCKDB_MEMORY_LIMIT = "64GB"
-
 # Streaming batch configuration
-BATCH_SIZE_DAYS = 7  # Process 1 week at a time (~168 hourly files)
-# At this size: ~7 days × 24 files × ~100MB per file = ~16GB uncompressed per batch
+BATCH_SIZE_DAYS = 1  # Process 1 day at a time (~24 hourly files)
 
 # Download configuration
 DOWNLOAD_WORKERS = 4
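Review note: a minimal sketch of the streaming loop a constant like BATCH_SIZE_DAYS typically drives; the iter_batches helper and the date bounds are illustrative, not code from msr.py. At ~100MB per hourly file, a 1-day batch is ~24 × 100MB ≈ 2.4GB uncompressed, versus ~16GB per batch at the old 7-day setting.

from datetime import date, timedelta

BATCH_SIZE_DAYS = 1  # ~24 hourly files per batch

def iter_batches(start: date, end: date, size_days: int = BATCH_SIZE_DAYS):
    """Yield (batch_start, batch_end) windows covering [start, end] inclusive."""
    cur = start
    while cur <= end:
        batch_end = min(cur + timedelta(days=size_days - 1), end)
        yield cur, batch_end
        cur = batch_end + timedelta(days=1)

# Example: three 1-day batches
for batch_start, batch_end in iter_batches(date(2024, 1, 1), date(2024, 1, 3)):
    print(f"processing {batch_start} .. {batch_end}")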
 
@@ -306,13 +301,28 @@ def get_duckdb_connection():
         # Re-raise if it's not a locking error
         raise
 
-    # OPTIMIZED SETTINGS
-    conn.execute(f"SET threads TO {DUCKDB_THREADS};")
-    conn.execute("SET preserve_insertion_order = false;")
-    conn.execute("SET enable_object_cache = true;")
+    # CORE MEMORY & THREADING SETTINGS
+    conn.execute(f"SET threads TO 8;")
+    conn.execute(f"SET max_memory = '48GB';")  # Hard cap
     conn.execute("SET temp_directory = '/tmp/duckdb_temp';")
-    conn.execute(f"SET memory_limit = '{DUCKDB_MEMORY_LIMIT}';")  # Per-query limit
-    conn.execute(f"SET max_memory = '{DUCKDB_MEMORY_LIMIT}';")  # Hard cap
+
+    # JSON STREAMING OPTIMIZATIONS (critical for performance)
+    conn.execute("SET json.read_objects = true;")  # Enable streaming JSON objects
+    conn.execute("SET json.read_buffer_size = '64MB';")  # Increase from 256KB default for large fields
+    conn.execute("SET json.format = 'newline_delimited';")  # Skip array parsing, double throughput
+
+    # GZIP PARALLEL DECOMPRESSION (only needed for .json.gz files)
+    try:
+        conn.execute("SET extension_directory = '/tmp/duckdb_ext';")
+        conn.execute("INSTALL 'gzip';")
+        conn.execute("LOAD 'gzip';")
+    except Exception as e:
+        print(f"  ⚠ Warning: Could not load gzip extension: {e}")
+
+    # PERFORMANCE OPTIMIZATIONS
+    conn.execute("SET preserve_insertion_order = false;")  # Disable expensive ordering
+    conn.execute("SET default_order = 'ORDER BY NONE';")  # Skip unnecessary sorting
+    conn.execute("SET enable_object_cache = true;")  # Cache repeatedly read files
 
     return conn
 
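Review note: a usage sketch of the patched connection, under assumptions not in the commit: hourly newline-delimited dumps live under data/ with names like 2024-01-15-0.json.gz (the real layout may differ), and the event rows carry a type field. DuckDB's JSON reader decompresses .json.gz transparently, which is presumably why the gzip extension load above is wrapped in a best-effort try/except; passing format = 'newline_delimited' to read_json_auto mirrors the setting applied in the commit.

from msr import get_duckdb_connection  # the function patched above

conn = get_duckdb_connection()
rows = conn.execute(
    """
    SELECT type, COUNT(*) AS n
    FROM read_json_auto('data/2024-01-15-*.json.gz',
                        format = 'newline_delimited')
    GROUP BY type
    ORDER BY n DESC
    """
).fetchall()
print(rows)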