Spaces:

RUC-AIBOX
/

OlymMATH-demo

Running

App Files Files Community

CoderBak committed on Jun 1

Commit

3923791

verified ·

1 Parent(s): e5b2622

Update app.py

Browse files

Files changed (1) hide show

app.py +720 -45

app.py CHANGED Viewed

@@ -22,6 +22,14 @@ SUBJECT_TRANS = {
     "组合": "Combinatorics"
 }
 MODEL_TRANS = {
     "acemath-rl-nemotron-7b": "AceMath-RL-Nemotron-7B",
     "deepseek-r1-distill-qwen-1.5b": "DeepSeek-R1-Distill-Qwen-1.5B",
@@ -65,6 +73,70 @@ DATASETS = ["EN-HARD", "EN-EASY", "ZH-HARD", "ZH-EASY"]
 # 全局数据库实例
 db = None
 class ModelDatabase:
     """Database access class"""
@@ -360,6 +432,82 @@ class ModelDatabase:
         # 清理所有缓存
         self.clear_cache()
 def format_latex(text):
     if text is None: return ""
     # Process the text for proper LaTeX rendering with KaTeX
@@ -372,12 +520,24 @@ def format_latex(text):
 def format_markdown_with_math(text):
     if text is None: return ""
-    # Don't add HTML tags or do special processing for LaTeX - let Gradio handle it
-    # Just clean up basic issues that might affect rendering
     # Convert newlines for markdown
     text = text.replace('\r\n', '\n').replace('\r', '\n')
     # Return the cleaned text for Gradio's markdown component to render
     return text
@@ -584,16 +744,9 @@ def handle_comparison_problem_update(problem_id, dataset_state):
         # Use format_markdown_with_math for proper rendering
         problem_content = format_markdown_with_math(problem_dict.get('problem', ''))
-        # 将答案中的双美元符号替换为单美元符号
         answer_text = problem_dict.get('answer', '')
-        # 先将$$...$$替换为单个$...$，使用re.DOTALL处理多行
-        answer_text = re.sub(r'\$\$(.*?)\$\$', r'$\1$', answer_text, flags=re.DOTALL)
-        # 检查答案是否已经包含美元符号，如果没有则添加
-        if '$' not in answer_text and answer_text.strip():
-            answer_text = f"${answer_text}$"
-        answer_content = format_markdown_with_math(answer_text)
         return problem_content, answer_content
     except Exception as e:
@@ -634,16 +787,9 @@ def handle_problem_select(problem_id_from_js, current_model_state, current_datas
             # Process problem and answer text for Markdown rendering
             problem_content = format_markdown_with_math(problem_dict.get('problem', ''))
-            # 将答案中的双美元符号替换为单美元符号
             answer_text = problem_dict.get('answer', '')
-            # 先将$$...$$替换为单个$...$，使用re.DOTALL处理多行
-            answer_text = re.sub(r'\$\$(.*?)\$\$', r'$\1$', answer_text, flags=re.DOTALL)
-            # 检查答案是否已经包含美元符号，如果没有则添加
-            if '$' not in answer_text and answer_text.strip():
-                answer_text = f"${answer_text}$"
-            answer_content = format_markdown_with_math(answer_text)
             # For comparison without model, we don't have samples to display
             return problem_content, answer_content, "", gr.State([])
@@ -673,16 +819,9 @@ def handle_problem_select(problem_id_from_js, current_model_state, current_datas
     # Process problem and answer text for Markdown rendering
     problem_content = format_markdown_with_math(problem_dict.get('problem', ''))
-    # 将答案中的双美元符号替换为单美元符号
     answer_text = problem_dict.get('answer', '')
-    # 先将$$...$$替换为单个$...$，使用re.DOTALL处理多行
-    answer_text = re.sub(r'\$\$(.*?)\$\$', r'$\1$', answer_text, flags=re.DOTALL)
-    # 检查答案是否已经包含美元符号，如果没有则添加
-    if '$' not in answer_text and answer_text.strip():
-        answer_text = f"${answer_text}$"
-    answer_content = format_markdown_with_math(answer_text)
     # Rest of the function remains the same
     if not responses_data:
@@ -709,7 +848,7 @@ def handle_problem_select(problem_id_from_js, current_model_state, current_datas
         samples_per_row = 16 if mode == 'comparison' else 32
         # 第一行: 样本 0-samples_per_row
-        samples_grid_html = f'<div style="display: grid; grid-template-columns: repeat({samples_per_row}, 1fr); gap: 2px; margin-bottom: 4px; overflow-x: auto;">'
         for i, resp in enumerate(displayed_samples[:samples_per_row]):
             correctness = resp.get('correctness', 0)
@@ -737,7 +876,7 @@ def handle_problem_select(problem_id_from_js, current_model_state, current_datas
         # 如果有更多样本，显示第二行
         if actual_display_count > samples_per_row:
             row_samples = displayed_samples[samples_per_row:2*samples_per_row]
-            samples_grid_html += f'<div style="display: grid; grid-template-columns: repeat({samples_per_row}, 1fr); gap: 2px; margin-bottom: 4px; overflow-x: auto;">'
             for i, resp in enumerate(row_samples):
                 actual_idx = i + samples_per_row
@@ -767,7 +906,7 @@ def handle_problem_select(problem_id_from_js, current_model_state, current_datas
             # 第三行
             row_samples = displayed_samples[2*samples_per_row:3*samples_per_row]
             if row_samples:
-                samples_grid_html += f'<div style="display: grid; grid-template-columns: repeat({samples_per_row}, 1fr); gap: 2px; margin-bottom: 4px; overflow-x: auto;">'
                 for i, resp in enumerate(row_samples):
                     actual_idx = i + 2*samples_per_row
@@ -796,7 +935,7 @@ def handle_problem_select(problem_id_from_js, current_model_state, current_datas
             if actual_display_count > 3*samples_per_row:
                 row_samples = displayed_samples[3*samples_per_row:4*samples_per_row]
                 if row_samples:
-                    samples_grid_html += f'<div style="display: grid; grid-template-columns: repeat({samples_per_row}, 1fr); gap: 2px; margin-bottom: 4px; overflow-x: auto;">'
                     for i, resp in enumerate(row_samples):
                         actual_idx = i + 3*samples_per_row
@@ -886,6 +1025,54 @@ def create_ui(db_path):
     global db
     db = ModelDatabase(db_path)
     AVAILABLE_DATASETS = db.get_available_datasets()
     if not AVAILABLE_DATASETS:
         AVAILABLE_DATASETS = ["EN-HARD", "EN-EASY", "ZH-HARD", "ZH-EASY"] # Fallback
@@ -896,9 +1083,9 @@ def create_ui(db_path):
     body, .gradio-container { font-family: sans-serif; font-size: 0.95em; line-height: 1.6; }
     .sample-btn { transition: all 0.15s ease-in-out; }
     .sample-btn:hover { transform: translateY(-1px); box-shadow: 0 2px 5px rgba(0,0,0,0.1); }
-    .problem-grid-container { overflow-y: auto; }
-    .math-content { overflow-x: auto; padding: 5px; }
-    .sample-response { overflow-y: clip !important; max-height: none !important; height: auto !important; }
     h1, h2, h3, h4, h5 { margin-top: 0.8em; margin-bottom: 0.4em; color: var(--color-text); }
     .gradio-tabs > div[role='tablist'] button { font-size: 0.9em; padding: 8px 12px; }
     .gr-dropdown select { font-size: 0.9em; }
@@ -964,6 +1151,68 @@ def create_ui(db_path):
         border: 1px solid #ddd;
         padding: 4px 8px;
     }
     """
     with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue=gr.themes.colors.blue, secondary_hue=gr.themes.colors.sky)) as demo:
@@ -989,6 +1238,64 @@ def create_ui(db_path):
         # 创建占位符State组件替代None
         dummy_state = gr.State(value=None)
         with gr.Tabs():
             with gr.TabItem("Single Model Analysis"):
                 with gr.Row(variant='compact'):
@@ -1228,6 +1535,83 @@ def create_ui(db_path):
                             ]
                         )
         # --- Event Handlers ---
         def update_available_models_for_dropdowns(selected_dataset):
             # This function can be used to update model lists if they are dataset-dependent
@@ -1549,6 +1933,37 @@ def create_ui(db_path):
             outputs=[sample_number_input]
         )
         return demo
 def monitor_memory_usage():
@@ -1575,6 +1990,273 @@ def monitor_memory_usage():
     except Exception as e:
         return "Memory monitor error"
 # 修改主函数以使用优化策略
 if __name__ == "__main__":
     DB_PATH = "data.db"
@@ -1582,22 +2264,15 @@ if __name__ == "__main__":
     # 检查数据库文件是否存在，如果不存在则从 Hugging Face 下载
     if not os.path.exists(DB_PATH):
         try:
-            # 从环境变量获取 HF_TOKEN
-            hf_token = os.environ.get("HF_TOKEN")
-            if not hf_token:
-                raise ValueError("HF_TOKEN environment variable is not set")
-            # 从 Hugging Face 下载数据库文件
             DB_PATH = hf_hub_download(
                 repo_id="CoderBak/OlymMATH-data",
                 filename="data.db",
-                repo_type="dataset",
-                token=hf_token
             )
         except Exception as e:
             # 创建一个显示错误信息的简单 Gradio 应用
             with gr.Blocks() as error_demo:
-                gr.Markdown(f"# Error: Database Download Failed\n{str(e)}\nPlease ensure HF_TOKEN is set correctly and try again.")
             error_demo.launch(server_name="0.0.0.0")
             exit(1)

     "组合": "Combinatorics"
 }
+# English-to-Chinese subject translation table
+SUBJECT_TRANS_EN_TO_ZH = {
+    "Algebra": "代数",
+    "Number Theory": "数论",
+    "Geometry": "几何",
+    "Combinatorics": "组合"
+}
 MODEL_TRANS = {
     "acemath-rl-nemotron-7b": "AceMath-RL-Nemotron-7B",
     "deepseek-r1-distill-qwen-1.5b": "DeepSeek-R1-Distill-Qwen-1.5B",
 # 全局数据库实例
 db = None
+# Global cache for Reference Solutions accuracies
+reference_accuracy_cache = {}
+def precompute_reference_accuracies(db, reference_loader):
+    """Pre-compute all reference problem accuracies for fast loading"""
+    global reference_accuracy_cache
+    if not db or not reference_loader:
+        return
+    print("Pre-computing reference problem accuracies...")
+    start_time = time.time()
+    problem_ids = reference_loader.get_all_problem_ids()
+    reference_accuracy_cache = {}
+    # Fetch the list of models once, outside the per-problem loop
+    all_models = db.get_available_models()
+    print(f"Computing accuracies for {len(problem_ids)} problems across {len(all_models)} models...")
+    for i, pid in enumerate(problem_ids):
+        if i % 5 == 0:  # print progress once every 5 problems
+            print(f"Processing problem {i+1}/{len(problem_ids)}: {pid}")
+        try:
+            en_unique_id = f"OlymMATH-HARD-{pid}-EN"
+            zh_unique_id = f"OlymMATH-HARD-{pid}-ZH"
+            en_accuracies = []
+            zh_accuracies = []
+            for model in all_models:
+                # English version
+                try:
+                    _, responses_en = db.get_problem_data(model, "EN-HARD", en_unique_id)
+                    if responses_en and len(responses_en) > 0:
+                        avg_accuracy_en = sum(r['correctness'] for r in responses_en) / len(responses_en)
+                        en_accuracies.append(avg_accuracy_en)
+                except Exception:
+                    pass  # NOTE(review): errors are silently ignored; this model is simply left out of the EN average
+                # Chinese version
+                try:
+                    _, responses_zh = db.get_problem_data(model, "ZH-HARD", zh_unique_id)
+                    if responses_zh and len(responses_zh) > 0:
+                        avg_accuracy_zh = sum(r['correctness'] for r in responses_zh) / len(responses_zh)
+                        zh_accuracies.append(avg_accuracy_zh)
+                except Exception:
+                    pass  # NOTE(review): errors are silently ignored; this model is simply left out of the ZH average
+            # Average over the models that returned responses (0.0 if none) and store in the cache
+            en_avg = sum(en_accuracies) / len(en_accuracies) if en_accuracies else 0.0
+            zh_avg = sum(zh_accuracies) / len(zh_accuracies) if zh_accuracies else 0.0
+            reference_accuracy_cache[pid] = {"EN": en_avg, "ZH": zh_avg}
+        except Exception as e:
+            print(f"Error computing accuracy for problem {pid}: {e}")
+            reference_accuracy_cache[pid] = {"EN": 0.0, "ZH": 0.0}
+    elapsed_time = time.time() - start_time
+    print(f"✅ Pre-computation completed in {elapsed_time:.2f} seconds")
+    print(f"✅ Cached accuracies for {len(reference_accuracy_cache)} problems")
 class ModelDatabase:
     """Database access class"""
         # 清理所有缓存
         self.clear_cache()
+class ReferenceDataLoader:
+    """Load and manage reference solutions data"""
+    def __init__(self, jsonl_path):
+        self.jsonl_path = jsonl_path
+        self.reference_data = {}
+        self._load_data()
+    def _load_data(self):
+        """Load data from extra.jsonl"""
+        try:
+            with open(self.jsonl_path, 'r', encoding='utf-8') as f:
+                for line in f:
+                    data = json.loads(line.strip())  # NOTE(review): a blank or malformed line raises here; the except is outside the loop, so all remaining lines are skipped
+                    unique_id = data['unique_id']
+                    self.reference_data[unique_id] = data
+        except Exception as e:
+            print(f"Error loading reference data: {e}")
+    def get_problem_data(self, unique_id):
+        """Get reference data for a specific problem ID"""
+        return self.reference_data.get(unique_id)
+    def get_all_problem_ids(self):
+        """Get all available problem IDs"""
+        return sorted(self.reference_data.keys())
+def calculate_reference_problem_accuracy(db, unique_id):
+    """Calculate average accuracy for a reference problem across all models for both EN and ZH versions"""
+    try:
+        # Build the unique_ids of the English and Chinese versions of the problem
+        en_unique_id = f"OlymMATH-HARD-{unique_id}-EN"
+        zh_unique_id = f"OlymMATH-HARD-{unique_id}-ZH"
+        print(f"Calculating accuracy for problem {unique_id}: EN={en_unique_id}, ZH={zh_unique_id}")
+        accuracies = {"EN": [], "ZH": []}
+        # Fetch all models
+        all_models = db.get_available_models()
+        print(f"Found {len(all_models)} models in database")
+        for model in all_models:
+            # English version
+            try:
+                _, responses_en = db.get_problem_data(model, "EN-HARD", en_unique_id)
+                if responses_en and len(responses_en) > 0:
+                    avg_accuracy_en = sum(r['correctness'] for r in responses_en) / len(responses_en)
+                    accuracies["EN"].append(avg_accuracy_en)
+                    print(f"  Model {model} EN: {avg_accuracy_en:.2%} ({len(responses_en)} responses)")
+            except Exception as e:
+                print(f"  Error getting EN data for model {model}: {e}")
+                pass  # no-op: the error was already reported by the print above
+            # Chinese version
+            try:
+                _, responses_zh = db.get_problem_data(model, "ZH-HARD", zh_unique_id)
+                if responses_zh and len(responses_zh) > 0:
+                    avg_accuracy_zh = sum(r['correctness'] for r in responses_zh) / len(responses_zh)
+                    accuracies["ZH"].append(avg_accuracy_zh)
+                    print(f"  Model {model} ZH: {avg_accuracy_zh:.2%} ({len(responses_zh)} responses)")
+            except Exception as e:
+                print(f"  Error getting ZH data for model {model}: {e}")
+                pass  # no-op: the error was already reported by the print above
+        # Average over the models that returned responses (0.0 if none)
+        en_avg = sum(accuracies["EN"]) / len(accuracies["EN"]) if accuracies["EN"] else 0.0
+        zh_avg = sum(accuracies["ZH"]) / len(accuracies["ZH"]) if accuracies["ZH"] else 0.0
+        print(f"Final averages for problem {unique_id}: EN={en_avg:.2%} (from {len(accuracies['EN'])} models), ZH={zh_avg:.2%} (from {len(accuracies['ZH'])} models)")
+        return en_avg, zh_avg
+    except Exception as e:
+        print(f"Error calculating accuracy for problem {unique_id}: {e}")
+        return 0.0, 0.0
 def format_latex(text):
     if text is None: return ""
     # Process the text for proper LaTeX rendering with KaTeX
 def format_markdown_with_math(text):
     if text is None: return ""
+    # Convert LaTeX delimiters first - same logic as format_solution_latex
+    # Convert $$xxx$$ to \[xxx\] (display math)
+    text = re.sub(r'\$\$(.*?)\$\$', r'\\[\1\\]', text, flags=re.DOTALL)
+    # Convert $xxx$ to \(xxx\) (inline math)
+    # Be careful not to match already converted \[...\] content
+    text = re.sub(r'(?<!\\)\$([^$\n]+?)\$(?!\])', r'\\(\1\\)', text)
     # Convert newlines for markdown
     text = text.replace('\r\n', '\n').replace('\r', '\n')
+    # Collapse three or more newlines (possibly separated by whitespace) into a single blank line
+    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
+    # Debug: Print if aligned environment detected
+    if '\\begin{aligned}' in text:
+        print(f"LaTeX aligned environment detected in text (first 200 chars): {text[:200]}...")
     # Return the cleaned text for Gradio's markdown component to render
     return text
         # Use format_markdown_with_math for proper rendering
         problem_content = format_markdown_with_math(problem_dict.get('problem', ''))
+        # Use special answer formatting
         answer_text = problem_dict.get('answer', '')
+        answer_content = format_answer_with_math(answer_text)
         return problem_content, answer_content
     except Exception as e:
             # Process problem and answer text for Markdown rendering
             problem_content = format_markdown_with_math(problem_dict.get('problem', ''))
+            # Use special answer formatting
             answer_text = problem_dict.get('answer', '')
+            answer_content = format_answer_with_math(answer_text)
             # For comparison without model, we don't have samples to display
             return problem_content, answer_content, "", gr.State([])
     # Process problem and answer text for Markdown rendering
     problem_content = format_markdown_with_math(problem_dict.get('problem', ''))
+    # Use special answer formatting
     answer_text = problem_dict.get('answer', '')
+    answer_content = format_answer_with_math(answer_text)
     # Rest of the function remains the same
     if not responses_data:
         samples_per_row = 16 if mode == 'comparison' else 32
         # 第一行: 样本 0-samples_per_row
+        samples_grid_html = f'<div style="display: grid; grid-template-columns: repeat({samples_per_row}, 1fr); gap: 2px; margin-bottom: 4px;">'
         for i, resp in enumerate(displayed_samples[:samples_per_row]):
             correctness = resp.get('correctness', 0)
         # 如果有更多样本，显示第二行
         if actual_display_count > samples_per_row:
             row_samples = displayed_samples[samples_per_row:2*samples_per_row]
+            samples_grid_html += f'<div style="display: grid; grid-template-columns: repeat({samples_per_row}, 1fr); gap: 2px; margin-bottom: 4px;">'
             for i, resp in enumerate(row_samples):
                 actual_idx = i + samples_per_row
             # 第三行
             row_samples = displayed_samples[2*samples_per_row:3*samples_per_row]
             if row_samples:
+                samples_grid_html += f'<div style="display: grid; grid-template-columns: repeat({samples_per_row}, 1fr); gap: 2px; margin-bottom: 4px;">'
                 for i, resp in enumerate(row_samples):
                     actual_idx = i + 2*samples_per_row
             if actual_display_count > 3*samples_per_row:
                 row_samples = displayed_samples[3*samples_per_row:4*samples_per_row]
                 if row_samples:
+                    samples_grid_html += f'<div style="display: grid; grid-template-columns: repeat({samples_per_row}, 1fr); gap: 2px; margin-bottom: 4px;">'
                     for i, resp in enumerate(row_samples):
                         actual_idx = i + 3*samples_per_row
     global db
     db = ModelDatabase(db_path)
+    # Initialize reference data loader with better path handling
+    reference_loader = None
+    # Try multiple possible paths for extra.jsonl
+    possible_paths = [
+        os.path.join(os.path.dirname(db_path), "extra.jsonl"),
+        os.path.join(os.getcwd(), "extra.jsonl"),
+        "extra.jsonl"
+    ]
+    for extra_jsonl_path in possible_paths:
+        if os.path.exists(extra_jsonl_path):
+            try:
+                reference_loader = ReferenceDataLoader(extra_jsonl_path)
+                print(f"Successfully loaded reference data from: {extra_jsonl_path}")
+                break
+            except Exception as e:
+                print(f"Error loading reference data from {extra_jsonl_path}: {e}")
+                continue
+    # If not found locally, try to download from Hugging Face
+    if not reference_loader:
+        try:
+            print("Attempting to download extra.jsonl from Hugging Face...")
+            extra_jsonl_path = hf_hub_download(
+                repo_id="CoderBak/OlymMATH-data",
+                filename="extra.jsonl",
+                repo_type="dataset"
+            )
+            reference_loader = ReferenceDataLoader(extra_jsonl_path)
+            print(f"Successfully downloaded and loaded reference data from: {extra_jsonl_path}")
+        except Exception as e:
+            print(f"Failed to download extra.jsonl from Hugging Face: {e}")
+    if not reference_loader:
+        print("Warning: extra.jsonl not found in any of the expected locations:")
+        for path in possible_paths:
+            print(f"  - {path}")
+        print("Reference Solutions tab will not be available.")
+    else:
+        # Test the reference data availability
+        test_reference_data_availability(db, reference_loader)
+        # Pre-compute reference problem accuracies for fast loading
+        precompute_reference_accuracies(db, reference_loader)
+        # Test LaTeX formatting
+        test_latex_formatting()
     AVAILABLE_DATASETS = db.get_available_datasets()
     if not AVAILABLE_DATASETS:
         AVAILABLE_DATASETS = ["EN-HARD", "EN-EASY", "ZH-HARD", "ZH-EASY"] # Fallback
     body, .gradio-container { font-family: sans-serif; font-size: 0.95em; line-height: 1.6; }
     .sample-btn { transition: all 0.15s ease-in-out; }
     .sample-btn:hover { transform: translateY(-1px); box-shadow: 0 2px 5px rgba(0,0,0,0.1); }
+    .problem-grid-container { overflow: visible !important; }
+    .math-content { overflow: visible !important; padding: 5px; }
+    .sample-response { overflow: visible !important; max-height: none !important; height: auto !important; }
     h1, h2, h3, h4, h5 { margin-top: 0.8em; margin-bottom: 0.4em; color: var(--color-text); }
     .gradio-tabs > div[role='tablist'] button { font-size: 0.9em; padding: 8px 12px; }
     .gr-dropdown select { font-size: 0.9em; }
         border: 1px solid #ddd;
         padding: 4px 8px;
     }
+    /* 隐藏滚动条但保留功能 */
+    ::-webkit-scrollbar {
+        display: none !important;
+        width: 0px !important;
+        height: 0px !important;
+    }
+    /* 主容器禁用滚动 */
+    .gradio-container {
+        overflow-x: hidden !important;
+    }
+    /* Gradio组件容器 */
+    .gradio-row, .gradio-column {
+        overflow: visible !important;
+        max-height: none !important;
+    }
+    /* HTML组件 */
+    .gr-html {
+        overflow: visible !important;
+        max-height: none !important;
+    }
+    /* Markdown组件保持可见 */
+    .gr-markdown {
+        overflow: visible !important;
+        max-height: none !important;
+    }
+    /* 特定的问题网格容器 */
+    #ref-problem-grid-container, #problem-grid-container, #comp-problem-grid-container-left, #comp-problem-grid-container-right {
+        overflow: visible !important;
+        max-height: none !important;
+        height: auto !important;
+    }
+    /* 样本网格 */
+    .sample-grid-btn {
+        overflow: visible !important;
+    }
+    /* 确保内容区域不会产生滚动条 */
+    .gr-form, .gr-box {
+        overflow: visible !important;
+        max-height: none !important;
+    }
+    /* Reference Solutions - 禁止Solution部分的滚动 */
+    #ref-solution {
+        overflow: hidden !important;
+        max-height: none !important;
+        height: auto !important;
+    }
+    /* 确保Solution内容容器也禁止滚动 */
+    #ref-solution .gr-markdown {
+        overflow: hidden !important;
+        max-height: none !important;
+        height: auto !important;
+    }
     """
     with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue=gr.themes.colors.blue, secondary_hue=gr.themes.colors.sky)) as demo:
         # 创建占位符State组件替代None
         dummy_state = gr.State(value=None)
+        # Add JavaScript for handling problem grid clicks
+        demo.load(lambda: None, js="""
+        () => {
+            // Handle problem button clicks for single model tab
+            function setupProblemGridListeners() {
+                document.addEventListener('click', function(e) {
+                    if (e.target.closest('.problem-btn')) {
+                        const problemBtn = e.target.closest('.problem-btn');
+                        const problemId = problemBtn.getAttribute('data-problem-id');
+                        if (problemId) {
+                            const problemInput = document.getElementById('problem-state-input');
+                            if (problemInput) {
+                                problemInput.querySelector('input').value = problemId;
+                                problemInput.querySelector('input').dispatchEvent(new Event('input', {bubbles: true}));
+                            }
+                        }
+                    }
+                    // Handle comparison problem button clicks
+                    if (e.target.closest('#comp-problem-grid-container-left .problem-btn') ||
+                        e.target.closest('#comp-problem-grid-container-right .problem-btn')) {
+                        const problemBtn = e.target.closest('.problem-btn');
+                        const problemId = problemBtn.getAttribute('data-problem-id');
+                        if (problemId) {
+                            const problemInput = document.getElementById('comp-problem-state-input');
+                            if (problemInput) {
+                                problemInput.querySelector('input').value = problemId;
+                                problemInput.querySelector('input').dispatchEvent(new Event('input', {bubbles: true}));
+                            }
+                        }
+                    }
+                    // Handle reference problem button clicks
+                    if (e.target.closest('#ref-problem-grid-container .ref-problem-btn')) {
+                        const problemBtn = e.target.closest('.ref-problem-btn');
+                        const problemId = problemBtn.getAttribute('data-problem-id');
+                        if (problemId) {
+                            const problemInput = document.getElementById('ref-problem-state-input');
+                            if (problemInput) {
+                                problemInput.querySelector('input').value = problemId;
+                                problemInput.querySelector('input').dispatchEvent(new Event('input', {bubbles: true}));
+                            }
+                        }
+                    }
+                });
+            }
+            // Set up listeners initially and after any DOM changes
+            setupProblemGridListeners();
+            // Re-setup listeners whenever the DOM changes (for dynamic content)
+            const observer = new MutationObserver(function(mutations) {
+                setupProblemGridListeners();
+            });
+            observer.observe(document.body, {childList: true, subtree: true});
+        }
+        """)
         with gr.Tabs():
             with gr.TabItem("Single Model Analysis"):
                 with gr.Row(variant='compact'):
                             ]
                         )
+            with gr.TabItem("Reference Solutions"):
+                with gr.Row(variant='compact'):
+                    with gr.Column(scale=1, min_width=280):
+                        ref_problem_state_input = gr.Textbox(
+                            value="",
+                            elem_id="ref-problem-state-input",
+                            visible=True,
+                            label="Enter Problem ID",
+                            container=True,
+                            interactive=True,
+                            every=0.5
+                        )
+                    with gr.Column(scale=3, min_width=400):
+                        gr.Markdown("#### Problem Grid (OlymMATH-HARD: All models avg. acc. - Top: EN, Bottom: ZH)")
+                        ref_problem_grid_html_output = gr.HTML(
+                            value="<div>Loading reference data...</div>",
+                            elem_id="ref-problem-grid-container"
+                        )
+                # 问题内容显示区域 - 左右分布
+                with gr.Row(variant='compact'):
+                    # 左侧：问题信息
+                    with gr.Column(scale=1):
+                        gr.Markdown("#### Problem (EN)")
+                        ref_problem_en_output = gr.Markdown(
+                            "Please select a problem.",
+                            latex_delimiters=[
+                                {"left": "$", "right": "$", "display": False},
+                                {"left": "$$", "right": "$$", "display": True},
+                                {"left": "\\(", "right": "\\)", "display": False},
+                                {"left": "\\[", "right": "\\]", "display": True}
+                            ]
+                        )
+                        gr.Markdown("#### Problem (ZH)")
+                        ref_problem_zh_output = gr.Markdown(
+                            "Please select a problem.",
+                            latex_delimiters=[
+                                {"left": "$", "right": "$", "display": False},
+                                {"left": "$$", "right": "$$", "display": True},
+                                {"left": "\\(", "right": "\\)", "display": False},
+                                {"left": "\\[", "right": "\\]", "display": True}
+                            ]
+                        )
+                        gr.Markdown("#### Subject")
+                        ref_subject_output = gr.Markdown("Please select a problem.")
+                        gr.Markdown("#### Answer")
+                        ref_answer_output = gr.Markdown(
+                            "Please select a problem.",
+                            latex_delimiters=[
+                                {"left": "$", "right": "$", "display": False},
+                                {"left": "$$", "right": "$$", "display": True},
+                                {"left": "\\(", "right": "\\)", "display": False},
+                                {"left": "\\[", "right": "\\]", "display": True}
+                            ]
+                        )
+                    # 右侧：解答
+                    with gr.Column(scale=1):
+                        gr.Markdown("#### Solution")
+                        ref_solution_output = gr.Markdown(
+                            "Please select a problem.",
+                            elem_id="ref-solution",
+                            latex_delimiters=[
+                                {"left": "$", "right": "$", "display": False},
+                                {"left": "$$", "right": "$$", "display": True},
+                                {"left": "\\(", "right": "\\)", "display": False},
+                                {"left": "\\[", "right": "\\]", "display": True},
+                                {"left": "\\begin{align}", "right": "\\end{align}", "display": True},
+                                {"left": "\\begin{aligned}", "right": "\\end{aligned}", "display": True},
+                                {"left": "\\begin{equation}", "right": "\\end{equation}", "display": True}
+                            ]
+                        )
         # --- Event Handlers ---
         def update_available_models_for_dropdowns(selected_dataset):
             # This function can be used to update model lists if they are dataset-dependent
             outputs=[sample_number_input]
         )
+        # 为引用解决方案标签页添加处理器
+        # 初始化引用问题网格
+        demo.load(
+            fn=lambda: create_reference_problem_grid_html(reference_loader, db),
+            inputs=[],
+            outputs=[ref_problem_grid_html_output]
+        )
+        # 引用问题选择事件
+        ref_problem_state_input.change(
+            fn=handle_reference_problem_select,
+            inputs=[ref_problem_state_input, gr.State(reference_loader)],
+            outputs=[ref_problem_en_output, ref_problem_zh_output, ref_subject_output, ref_answer_output, ref_solution_output]
+        )
+        # This is the crucial link: problem_state_input is changed by user, triggers this Python callback.
+        problem_state_input.change(
+            fn=handle_problem_select,
+            inputs=[problem_state_input, current_model_state, current_dataset_state],
+            outputs=[problem_markdown_output, answer_markdown_output, samples_grid_output, current_samples_data_state]
+        ).then(
+            # 重置Sample Number为0
+            fn=lambda: "0",
+            inputs=[],
+            outputs=[sample_number_input]
+        ).then(
+            fn=handle_first_sample,
+            inputs=[current_samples_data_state],
+            outputs=[sample_metadata_output, sample_response_output]
+        )
         return demo
 def monitor_memory_usage():
     except Exception as e:
         return "Memory monitor error"
def _reference_button_html(pid, lang, accuracy):
    """Render one clickable grid cell for problem ``pid``.

    ``lang`` is the label shown in the tooltip ("EN" or "ZH") and
    ``accuracy`` is the cached average accuracy for that language. The
    background colour is whatever ``get_gradient_color`` (defined elsewhere
    in this file) returns for that accuracy.
    """
    bg_color = get_gradient_color(accuracy)
    acc_pct = int(accuracy * 100)
    return f"""
        <div
            data-problem-id="{pid}"
            class="ref-problem-btn"
            title="ID: {pid} ({lang}) - Avg Acc: {acc_pct}%"
            style='background-color: {bg_color}; color: white !important;
                   border-radius: 4px; padding: 5px; text-align: center; font-size: 0.7em;
                   min-height: 36px; user-select: none; width: 100%;
                   display: flex; flex-direction: column; justify-content: center;
                   overflow: hidden; text-overflow: ellipsis; white-space: nowrap; cursor: pointer;'>
            <div style="font-weight: bold; color: white !important;">{pid}</div>
            <div style="color: white !important;">{acc_pct}%</div>
        </div>
        """


def create_reference_problem_grid_html(reference_loader, db):
    """Create HTML for the reference problem grid with cached average accuracies.

    The result contains two grids of buttons, one per language: the first
    shows the EN accuracy of every reference problem, the second the ZH
    accuracy. Accuracies are read from the module-level
    ``reference_accuracy_cache``; nothing is computed here.

    Args:
        reference_loader: Object exposing ``get_all_problem_ids()``.
        db: Database handle. Only its truthiness is checked in this function.

    Returns:
        An HTML string. When the database, the loader, the problem list or
        the accuracy cache is missing/empty, a short ``<div>`` message is
        returned instead of the grid.
    """
    if not db:
        return "<div>Database not available.</div>"
    if not reference_loader:
        return "<div><strong>No reference data available.</strong><br>Please ensure <code>extra.jsonl</code> file is in the same directory as the database file or in the current working directory.</div>"

    problem_ids = reference_loader.get_all_problem_ids()
    if not problem_ids:
        return "<div>No reference problems found in extra.jsonl file.</div>"

    # An empty cache means the accuracies have not been filled in yet, so
    # show a loading hint rather than a grid full of zeros.
    if not reference_accuracy_cache:
        return "<div><strong>Computing problem accuracies...</strong><br>This may take a moment on first load.</div>"

    print(f"Using cached accuracies for {len(problem_ids)} reference problems")

    custom_style = "<style>.ref-problem-btn, .ref-problem-btn div { color: white !important; }</style>"

    # Order the buttons numerically rather than lexicographically.
    sorted_problem_ids = sorted(problem_ids, key=int)

    # Collect the cells in lists and join once instead of growing a string
    # with "+=" inside the loop.
    en_cells = []
    zh_cells = []
    for pid in sorted_problem_ids:
        # A missing or partial cache entry falls back to 0% for that language.
        accuracy_data = reference_accuracy_cache.get(pid) or {}
        en_cells.append(_reference_button_html(pid, "EN", accuracy_data.get("EN", 0.0)))
        zh_cells.append(_reference_button_html(pid, "ZH", accuracy_data.get("ZH", 0.0)))
    html_en = "".join(en_cells)
    html_zh = "".join(zh_cells)

    # One column per problem, capped at 30 columns per row.
    grid_cols = min(len(sorted_problem_ids), 30)

    grid_html = f"""
    {custom_style}
    <div style='margin-bottom: 10px;'>
        <div style='display: grid; grid-template-columns: repeat({grid_cols}, 1fr); gap: 2px;'>{html_en}</div>
    </div>
    <div>
        <div style='display: grid; grid-template-columns: repeat({grid_cols}, 1fr); gap: 2px;'>{html_zh}</div>
    </div>
    """
    return grid_html
def handle_reference_problem_select(problem_id, reference_loader):
    """Handle reference problem selection and build the five display panes.

    Args:
        problem_id: Selected problem id; anything ``int()`` accepts.
        reference_loader: Object exposing ``get_problem_data(int)``, which
            returns a dict for a known problem or a falsy value otherwise.

    Returns:
        A 5-tuple ``(en_problem, zh_problem, subject, answer, solution)`` of
        strings for the Markdown outputs. When nothing is selected, the id is
        not a valid integer, or the problem is unknown, every slot carries a
        short placeholder/error message instead.
    """
    if not problem_id or not reference_loader:
        return ("Please select a problem.",) * 5

    try:
        problem_id_int = int(problem_id)
    except (TypeError, ValueError):
        # TypeError covers non-string, non-numeric values (e.g. a list),
        # which would otherwise escape the callback as an exception.
        return ("Please enter a valid problem ID.",) * 5

    reference_data = reference_loader.get_problem_data(problem_id_int)
    if not reference_data:
        error_msg = f"Problem {problem_id_int} not found in reference data."
        return (error_msg, error_msg, "No subject available.", "No answer available.", "Solution not available.")

    # Problem statements in both languages.
    en_problem = format_markdown_with_math(reference_data.get('en_problem', 'Problem (EN) not available.'))
    zh_problem = format_markdown_with_math(reference_data.get('zh_problem', 'Problem (ZH) not available.'))

    # Answers go through the dedicated answer formatter.
    answer_text = reference_data.get('answer', 'No answer available.')
    answer = format_answer_with_math(answer_text)

    # Subject is shown as "English / Chinese"; unknown subjects fall back to
    # the English label on both sides.
    subject_en = reference_data.get('subject', 'Unknown')
    subject_zh = SUBJECT_TRANS_EN_TO_ZH.get(subject_en, subject_en)
    subject_display = f"**{subject_en}** / **{subject_zh}**"

    # Only real solution text is run through the LaTeX preprocessor; a
    # missing, null or empty solution shows the placeholder instead of a
    # blank pane.
    solution_text = reference_data.get('solution')
    if solution_text:
        solution = format_solution_latex(solution_text)
    else:
        solution = 'Solution not available.'

    return (en_problem, zh_problem, subject_display, answer, solution)
def test_reference_data_availability(db, reference_loader):
    """Print a diagnostic report on the database and the reference data.

    Despite the ``test_`` prefix this is a runtime diagnostic, not a unit
    test: it prints what it finds and returns a flag.

    Args:
        db: Database wrapper exposing ``conn`` (a DB-API connection),
            ``get_available_models()`` and
            ``get_problem_data(model, dataset, unique_id)``.
        reference_loader: Object exposing ``get_all_problem_ids()``.

    Returns:
        ``False`` when ``db`` or ``reference_loader`` is missing, ``True``
        otherwise. A failure while inspecting the schema is only printed and
        does not change the return value.
    """
    print("=== Reference Data Availability Test ===")

    if not db:
        print("❌ Database is not available")
        return False

    # Schema overview; any failure here is reported and the remaining
    # checks still run.
    try:
        cursor = db.conn.cursor()
        try:
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
            tables = [row[0] for row in cursor.fetchall()]
            print(f"✅ Database tables: {tables}")

            cursor.execute("SELECT COUNT(*) FROM problems")
            problem_count = cursor.fetchone()[0]
            print(f"✅ Problems table: {problem_count} problems")

            cursor.execute("SELECT COUNT(*) FROM responses")
            response_count = cursor.fetchone()[0]
            print(f"✅ Responses table: {response_count} responses")

            cursor.execute("SELECT DISTINCT dataset FROM responses")
            datasets = [row[0] for row in cursor.fetchall()]
            print(f"✅ Available datasets: {datasets}")

            cursor.execute("SELECT unique_id FROM problems LIMIT 10")
            sample_ids = [row[0] for row in cursor.fetchall()]
            print(f"✅ Sample problem unique_ids: {sample_ids}")
        finally:
            # Release the cursor even when one of the queries fails.
            cursor.close()
    except Exception as e:
        print(f"❌ Error checking database schema: {e}")

    models = db.get_available_models()
    print(f"✅ Database connected: {len(models)} models available")

    if not reference_loader:
        print("❌ Reference loader is not available (extra.jsonl not found)")
        return False

    problem_ids = reference_loader.get_all_problem_ids()
    print(f"✅ Reference loader: {len(problem_ids)} problems available: {problem_ids}")

    # Spot-check the first reference problem against the HARD datasets,
    # using unique ids of the form "OlymMATH-HARD-<id>-<LANG>".
    if problem_ids:
        test_id = problem_ids[0]
        en_unique_id = f"OlymMATH-HARD-{test_id}-EN"
        zh_unique_id = f"OlymMATH-HARD-{test_id}-ZH"
        print(f"Testing with constructed IDs: {en_unique_id}, {zh_unique_id}")

        problem_en, responses_en = db.get_problem_data(None, "EN-HARD", en_unique_id)
        problem_zh, responses_zh = db.get_problem_data(None, "ZH-HARD", zh_unique_id)

        print(f"Test problem {test_id}:")
        print(f"  EN problem exists: {problem_en is not None}")
        print(f"  ZH problem exists: {problem_zh is not None}")
        if responses_en:
            print(f"  EN responses: {len(responses_en)} found")
        if responses_zh:
            print(f"  ZH responses: {len(responses_zh)} found")

    print("=== End Test ===")
    return True


# The name matches pytest's default test pattern; opt out of collection so a
# test run that imports this module does not try to resolve `db` and
# `reference_loader` as fixtures.
test_reference_data_availability.__test__ = False
def test_latex_formatting():
    """Print a manual check of how a sample solution passes through formatting.

    Feeds a fixed sample (Chinese prose, a ``$$``-wrapped ``aligned``
    environment and an inline ``\\boxed`` answer) to
    ``format_markdown_with_math`` and prints whether the ``aligned``
    environment is still present afterwards.

    Returns:
        The formatted text.
    """
    test_text = """
易知，1, 4, 6, 7, 9 这五个数中的任意两个数之差均不为 4 或 7.
$$
\\begin{aligned}
\\sum_{n=1}^{2023}f_{n} &= \\sum_{k=0}^{183}\\sum_{i=0}^{10}f_{11k+i} \\\\
&= \\sum_{k=0}^{183}(11 \\times 5k+1+2+3+5 \\times 4+2 \\times 5) \\\\
&= 55 \\times \\frac{183 \\times 184}{2}+184 \\times 36 \\\\
&= 932604.
\\end{aligned}
$$
故答案为：$\\boxed{932604}$.
"""
    formatted = format_markdown_with_math(test_text)
    print("=== LaTeX Formatting Test ===")
    print("Original text contains \\begin{aligned}:", "\\begin{aligned}" in test_text)
    print("Formatted text contains \\begin{aligned}:", "\\begin{aligned}" in formatted)
    print("Formatted text (first 300 chars):", formatted[:300])
    print("=== End Test ===")
    return formatted


# Runtime diagnostic, not a unit test: the name matches pytest's default
# pattern and the function returns a value, so opt out of collection.
test_latex_formatting.__test__ = False
def format_solution_latex(text):
    r"""Rewrite dollar-delimited math in a solution as backslash delimiters.

    ``$$...$$`` (may span lines) becomes ``\[...\]`` and single-line
    ``$...$`` becomes ``\(...\)``. A dollar sign escaped as ``\$`` does not
    open inline math. Line endings are normalised to ``\n`` and runs of
    three or more (possibly whitespace-only) line breaks are collapsed to
    one blank line.

    Args:
        text: Solution text, or ``None``.

    Returns:
        The converted text; ``""`` when ``text`` is ``None``.
    """
    if text is None:
        return ""

    # Normalise line endings first so a lone "\r" is treated as a line break
    # by the inline-math pattern, exactly like "\n".
    text = text.replace('\r\n', '\n').replace('\r', '\n')

    # Display math: non-greedy, DOTALL so the body may span several lines.
    text = re.sub(r'\$\$(.*?)\$\$', r'\\[\1\\]', text, flags=re.DOTALL)

    # Inline math. Every "$$" pair has already been consumed above, so no
    # extra guard on the closing "$" is needed; a guard against a following
    # "]" would wrongly skip text such as "[$a$, $b$]".
    text = re.sub(r'(?<!\\)\$([^$\n]+?)\$', r'\\(\1\\)', text)

    # Collapse excessive blank lines.
    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
    return text
def format_answer_with_math(text):
    r"""Format an answer field so it is delimited with ``\(...\)``.

    ``$$...$$`` is first reduced to ``$...$``; an answer containing no
    dollar sign at all is wrapped in ``$...$``; finally every single-line
    ``$...$`` is rewritten as ``\(...\)``. Blank text and the placeholder
    ``"No answer available."`` are returned unchanged.

    Args:
        text: Answer text, or ``None``.

    Returns:
        The formatted answer; ``""`` when ``text`` is ``None``, matching the
        other formatting helpers in this file.
    """
    if text is None:
        return ""
    if text.strip() == "" or text == "No answer available.":
        return text

    # Normalise line endings for markdown.
    text = text.replace('\r\n', '\n').replace('\r', '\n')

    # Answers are shown inline, so display delimiters become inline ones.
    text = re.sub(r'\$\$(.*?)\$\$', r'$\1$', text, flags=re.DOTALL)

    # A bare answer (no dollar sign anywhere) is treated as math.
    if '$' not in text:
        text = f"${text}$"

    # Convert $...$ to \(...\); "\$" is an escaped dollar and is left alone.
    text = re.sub(r'(?<!\\)\$([^$\n]+?)\$', r'\\(\1\\)', text)

    # Collapse excessive blank lines.
    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
    return text
 # 修改主函数以使用优化策略
 if __name__ == "__main__":
     DB_PATH = "data.db"
     # 检查数据库文件是否存在，如果不存在则从 Hugging Face 下载
     if not os.path.exists(DB_PATH):
         try:
             DB_PATH = hf_hub_download(
                 repo_id="CoderBak/OlymMATH-data",
                 filename="data.db",
+                repo_type="dataset"
             )
         except Exception as e:
             # 创建一个显示错误信息的简单 Gradio 应用
             with gr.Blocks() as error_demo:
+                gr.Markdown(f"# Error: Database Download Failed\n{str(e)}")
             error_demo.launch(server_name="0.0.0.0")
             exit(1)