zhimin-z committed
Commit · 2faae44
Parent(s): 2344e22
fix
msr.py
CHANGED
@@ -40,13 +40,8 @@ LEADERBOARD_TIME_FRAME_DAYS = 180
 # Git sync configuration (mandatory to get latest bot data)
 GIT_SYNC_TIMEOUT = 300 # 5 minutes timeout for git pull
 
-# OPTIMIZED DUCKDB CONFIGURATION
-DUCKDB_THREADS = 16
-DUCKDB_MEMORY_LIMIT = "64GB"
-
 # Streaming batch configuration
-BATCH_SIZE_DAYS = 7
-# At this size: ~7 days × 24 files × ~100MB per file = ~16GB uncompressed per batch
+BATCH_SIZE_DAYS = 1 # Process 1 day at a time (~24 hourly files)
 
 # Download configuration
 DOWNLOAD_WORKERS = 4
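The batch window shrinks from roughly a week of hourly dumps to a single day, so each batch now covers about 24 files instead of ~168. As a minimal sketch of how a constant like BATCH_SIZE_DAYS can drive the batching loop (iter_day_batches is a hypothetical helper written for this example, not code from msr.py):

from datetime import date, timedelta

BATCH_SIZE_DAYS = 1  # value introduced by this commit

def iter_day_batches(start: date, end: date, batch_days: int = BATCH_SIZE_DAYS):
    """Yield (window_start, window_end) date pairs covering [start, end)."""
    current = start
    while current < end:
        window_end = min(current + timedelta(days=batch_days), end)
        yield current, window_end
        current = window_end

# Example: three 1-day windows, each mapping to ~24 hourly dump files
for lo, hi in iter_day_batches(date(2024, 1, 1), date(2024, 1, 4)):
    print(lo, "->", hi)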
@@ -306,13 +301,28 @@ def get_duckdb_connection():
                 # Re-raise if it's not a locking error
                 raise
 
-    # …
-    conn.execute(f"SET threads TO {DUCKDB_THREADS};")
-    conn.execute("SET …
-    conn.execute("SET enable_object_cache = true;")
+    # CORE MEMORY & THREADING SETTINGS
+    conn.execute(f"SET threads TO 8;")
+    conn.execute(f"SET max_memory = '48GB';") # Hard cap
     conn.execute("SET temp_directory = '/tmp/duckdb_temp';")
-
-
+
+    # JSON STREAMING OPTIMIZATIONS (critical for performance)
+    conn.execute("SET json.read_objects = true;") # Enable streaming JSON objects
+    conn.execute("SET json.read_buffer_size = '64MB';") # Increase from 256KB default for large fields
+    conn.execute("SET json.format = 'newline_delimited';") # Skip array parsing, double throughput
+
+    # GZIP PARALLEL DECOMPRESSION (only needed for .json.gz files)
+    try:
+        conn.execute("SET extension_directory = '/tmp/duckdb_ext';")
+        conn.execute("INSTALL 'gzip';")
+        conn.execute("LOAD 'gzip';")
+    except Exception as e:
+        print(f" ⚠ Warning: Could not load gzip extension: {e}")
+
+    # PERFORMANCE OPTIMIZATIONS
+    conn.execute("SET preserve_insertion_order = false;") # Disable expensive ordering
+    conn.execute("SET default_order = 'ORDER BY NONE';") # Skip unnecessary sorting
+    conn.execute("SET enable_object_cache = true;") # Cache repeatedly read files
 
     return conn
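The surviving context lines ("Re-raise if it's not a locking error" / "raise") imply that get_duckdb_connection() retries while the database file is locked before applying the SET statements above. A rough sketch of that pattern under those assumptions; the database path, retry policy, and error matching below are invented for illustration rather than taken from msr.py:

import time
import duckdb

def get_duckdb_connection(db_path: str = "/tmp/msr.duckdb", retries: int = 5):
    conn = None
    for attempt in range(retries):
        try:
            conn = duckdb.connect(db_path)
            break
        except duckdb.IOException as exc:
            if "lock" in str(exc).lower() and attempt < retries - 1:
                time.sleep(2 ** attempt)  # back off; another process holds the file
            else:
                # Re-raise if it's not a locking error
                raise

    # Core memory & threading settings (values mirror the diff above)
    conn.execute("SET threads TO 8;")
    conn.execute("SET memory_limit = '48GB';")
    conn.execute("SET temp_directory = '/tmp/duckdb_temp';")
    conn.execute("SET preserve_insertion_order = false;")
    conn.execute("SET enable_object_cache = true;")
    return conn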
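For context on the JSON settings, a short usage sketch of how a connection tuned this way would typically consume one day's batch through DuckDB's read_json table function; the directory layout, file glob, and the assumption that the dumps are gzip-compressed newline-delimited JSON are made up for the example:

import duckdb

conn = duckdb.connect()  # in msr.py this would come from get_duckdb_connection()

# Hypothetical layout: one directory per day holding ~24 hourly .json.gz dumps.
# DuckDB detects gzip compression from the file extension, and
# format='newline_delimited' reads one JSON object per line instead of
# materialising a single large array.
day_glob = "data/2024-01-01/*.json.gz"
row_count = conn.execute(
    f"SELECT count(*) FROM read_json('{day_glob}', format = 'newline_delimited')"
).fetchone()[0]
print(f"{row_count} events in this 1-day batch")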