Spaces:
Running
Running
Erva Ulusoy
committed on
Commit
·
a1e2231
1
Parent(s):
14c3500
added filtering options for second-degree edge visualization
Browse files
- ProtHGT_app.py +48 -13
- visualize_kg.py +26 -8
ProtHGT_app.py
CHANGED
|
@@ -562,18 +562,28 @@ if st.session_state.submitted:
|
|
| 562 |
# Create visualizations in each tab
|
| 563 |
for idx, protein_id in enumerate(selected_proteins):
|
| 564 |
with protein_tabs[idx]:
|
| 565 |
-
col1, col2 = st.columns([
|
| 566 |
with col1:
|
| 567 |
max_node_count = st.slider(
|
| 568 |
-
"Maximum neighbors per edge type",
|
| 569 |
min_value=5,
|
| 570 |
max_value=50,
|
| 571 |
value=10,
|
| 572 |
step=5,
|
| 573 |
-
help="Control the maximum number of
|
| 574 |
key=f"slider_{protein_id}"
|
| 575 |
)
|
| 576 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 577 |
# Check if both visualizations exist for this protein
|
| 578 |
viz_exists = (protein_id in st.session_state.protein_visualizations and
|
| 579 |
'first_degree' in st.session_state.protein_visualizations[protein_id] and
|
|
@@ -592,6 +602,7 @@ if st.session_state.submitted:
|
|
| 592 |
protein_id,
|
| 593 |
st.session_state.predictions_df,
|
| 594 |
limit=max_node_count,
|
|
|
|
| 595 |
include_second_degree=False
|
| 596 |
)
|
| 597 |
|
|
@@ -601,29 +612,48 @@ if st.session_state.submitted:
|
|
| 601 |
protein_id,
|
| 602 |
st.session_state.predictions_df,
|
| 603 |
limit=max_node_count,
|
|
|
|
| 604 |
include_second_degree=True
|
| 605 |
)
|
| 606 |
|
| 607 |
# Store both visualizations in session state
|
| 608 |
st.session_state.protein_visualizations[protein_id]['first_degree'] = {
|
| 609 |
'path': html_path_1st,
|
| 610 |
-
'edges': edges_1st
|
|
|
|
|
|
|
|
|
|
|
|
|
| 611 |
}
|
| 612 |
st.session_state.protein_visualizations[protein_id]['second_degree'] = {
|
| 613 |
'path': html_path_2nd,
|
| 614 |
-
'edges': edges_2nd
|
|
|
|
|
|
|
|
|
|
|
|
|
| 615 |
}
|
| 616 |
st.rerun()
|
| 617 |
|
| 618 |
# If visualization exists, show the toggle and display appropriate version
|
| 619 |
if viz_exists:
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
|
| 623 |
-
|
| 624 |
-
|
| 625 |
-
|
| 626 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 627 |
|
| 628 |
# Get the appropriate visualization based on checkbox
|
| 629 |
viz_type = 'second_degree' if include_second_degree else 'first_degree'
|
|
@@ -663,6 +693,11 @@ if st.session_state.submitted:
|
|
| 663 |
del st.session_state.protein_visualizations[protein_id]
|
| 664 |
st.rerun()
|
| 665 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 666 |
# Display the appropriate visualization
|
| 667 |
with open(viz_info['path'], 'r', encoding='utf-8') as f:
|
| 668 |
html_content = f.read()
|
|
|
|
| 562 |
# Create visualizations in each tab
|
| 563 |
for idx, protein_id in enumerate(selected_proteins):
|
| 564 |
with protein_tabs[idx]:
|
| 565 |
+
col1, col2 = st.columns([1, 1])
|
| 566 |
with col1:
|
| 567 |
max_node_count = st.slider(
|
| 568 |
+
"Maximum neighbors per edge type (first-degree)",
|
| 569 |
min_value=5,
|
| 570 |
max_value=50,
|
| 571 |
value=10,
|
| 572 |
step=5,
|
| 573 |
+
help="Control the maximum number of direct neighbors of the query protein shown for each relationship type",
|
| 574 |
key=f"slider_{protein_id}"
|
| 575 |
)
|
| 576 |
+
with col2:
|
| 577 |
+
second_degree_limit = st.slider(
|
| 578 |
+
"Maximum neighbors per edge type (second-degree)",
|
| 579 |
+
min_value=2,
|
| 580 |
+
max_value=10,
|
| 581 |
+
value=3,
|
| 582 |
+
step=1,
|
| 583 |
+
help="Control the maximum number of second-degree neighbors of the query protein shown for each relationship type. Second-degree edge limit is intentionally kept low to maintain visual clarity. Higher values may make the graph cluttered and difficult to interpret.",
|
| 584 |
+
key=f"second_degree_slider_{protein_id}"
|
| 585 |
+
)
|
| 586 |
+
|
| 587 |
# Check if both visualizations exist for this protein
|
| 588 |
viz_exists = (protein_id in st.session_state.protein_visualizations and
|
| 589 |
'first_degree' in st.session_state.protein_visualizations[protein_id] and
|
|
|
|
| 602 |
protein_id,
|
| 603 |
st.session_state.predictions_df,
|
| 604 |
limit=max_node_count,
|
| 605 |
+
second_degree_limit=second_degree_limit,
|
| 606 |
include_second_degree=False
|
| 607 |
)
|
| 608 |
|
|
|
|
| 612 |
protein_id,
|
| 613 |
st.session_state.predictions_df,
|
| 614 |
limit=max_node_count,
|
| 615 |
+
second_degree_limit=second_degree_limit,
|
| 616 |
include_second_degree=True
|
| 617 |
)
|
| 618 |
|
| 619 |
# Store both visualizations in session state
|
| 620 |
st.session_state.protein_visualizations[protein_id]['first_degree'] = {
|
| 621 |
'path': html_path_1st,
|
| 622 |
+
'edges': edges_1st,
|
| 623 |
+
'settings': {
|
| 624 |
+
'max_node_count': max_node_count,
|
| 625 |
+
'second_degree_limit': second_degree_limit
|
| 626 |
+
}
|
| 627 |
}
|
| 628 |
st.session_state.protein_visualizations[protein_id]['second_degree'] = {
|
| 629 |
'path': html_path_2nd,
|
| 630 |
+
'edges': edges_2nd,
|
| 631 |
+
'settings': {
|
| 632 |
+
'max_node_count': max_node_count,
|
| 633 |
+
'second_degree_limit': second_degree_limit
|
| 634 |
+
}
|
| 635 |
}
|
| 636 |
st.rerun()
|
| 637 |
|
| 638 |
# If visualization exists, show the toggle and display appropriate version
|
| 639 |
if viz_exists:
|
| 640 |
+
# Check if settings have changed, but handle cases where settings don't exist
|
| 641 |
+
current_settings = {
|
| 642 |
+
'max_node_count': max_node_count,
|
| 643 |
+
'second_degree_limit': second_degree_limit
|
| 644 |
+
}
|
| 645 |
+
|
| 646 |
+
# Safely get stored settings or use None if they don't exist
|
| 647 |
+
stored_settings = (st.session_state.protein_visualizations[protein_id]['first_degree'].get('settings')
|
| 648 |
+
if 'first_degree' in st.session_state.protein_visualizations[protein_id]
|
| 649 |
+
else None)
|
| 650 |
+
|
| 651 |
+
include_second_degree = st.checkbox(
|
| 652 |
+
"Include second-degree edges",
|
| 653 |
+
value=False,
|
| 654 |
+
key=f"second_degree_{protein_id}",
|
| 655 |
+
help="Show connections between neighbor nodes"
|
| 656 |
+
)
|
| 657 |
|
| 658 |
# Get the appropriate visualization based on checkbox
|
| 659 |
viz_type = 'second_degree' if include_second_degree else 'first_degree'
|
|
|
|
| 693 |
del st.session_state.protein_visualizations[protein_id]
|
| 694 |
st.rerun()
|
| 695 |
|
| 696 |
+
if stored_settings is not None and current_settings != stored_settings:
|
| 697 |
+
st.warning("⚠️ Settings have changed. Click 'Regenerate Visualization' to apply new settings.")
|
| 698 |
+
elif stored_settings is None:
|
| 699 |
+
st.warning("⚠️ Visualization was generated with default settings. Consider regenerating to apply custom settings.")
|
| 700 |
+
|
| 701 |
# Display the appropriate visualization
|
| 702 |
with open(viz_info['path'], 'r', encoding='utf-8') as f:
|
| 703 |
html_content = f.read()
|
visualize_kg.py
CHANGED
|
@@ -131,9 +131,23 @@ def _gather_protein_edges(data, protein_id):
|
|
| 131 |
|
| 132 |
return protein_edges
|
| 133 |
|
| 134 |
-
def _filter_edges(protein_id, protein_edges, prediction_df, limit=10):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
filtered_edges = {}
|
| 136 |
|
|
|
|
|
|
|
|
|
|
| 137 |
prediction_categories = prediction_df['GO_category'].unique()
|
| 138 |
prediction_categories = [GO_CATEGORY_MAPPING[category] for category in prediction_categories]
|
| 139 |
go_category_reverse_mapping = {v:k for k, v in GO_CATEGORY_MAPPING.items()}
|
|
@@ -160,18 +174,18 @@ def _filter_edges(protein_id, protein_edges, prediction_df, limit=10):
|
|
| 160 |
edge = (protein_id, term)
|
| 161 |
is_ground_truth = edge in edges_set
|
| 162 |
valid_edges.append((edge, prob, is_ground_truth))
|
| 163 |
-
if len(valid_edges) >=
|
| 164 |
break
|
| 165 |
filtered_edges[edge_type] = valid_edges
|
| 166 |
else:
|
| 167 |
# If no predictions but it's a GO category in prediction_df
|
| 168 |
-
filtered_edges[edge_type] = [(edge, 'no_pred', True) for edge in list(edges)[:
|
| 169 |
else:
|
| 170 |
# For GO terms not in prediction_df, mark them as ground truth with blue color
|
| 171 |
-
filtered_edges[edge_type] = [(edge, 'no_pred', True) for edge in list(edges)[:
|
| 172 |
else:
|
| 173 |
# For non-GO edges, include all edges up to limit
|
| 174 |
-
filtered_edges[edge_type] = [(edge, None, True) for edge in list(edges)[:
|
| 175 |
|
| 176 |
return filtered_edges
|
| 177 |
|
|
@@ -225,7 +239,8 @@ def visualize_protein_subgraph(data, protein_id, prediction_df, limit=10, second
|
|
| 225 |
|
| 226 |
# Get the first-degree edges and filter them
|
| 227 |
protein_edges = _gather_protein_edges(data, protein_id)
|
| 228 |
-
first_degree_edges = _filter_edges(protein_id, protein_edges, prediction_df,
|
|
|
|
| 229 |
|
| 230 |
# Initialize all_edges with first degree edges
|
| 231 |
all_edges = first_degree_edges.copy()
|
|
@@ -243,11 +258,14 @@ def visualize_protein_subgraph(data, protein_id, prediction_df, limit=10, second
|
|
| 243 |
if target != protein_id:
|
| 244 |
neighbor_nodes.add((target, target_type))
|
| 245 |
|
| 246 |
-
# Gather and filter second-degree edges
|
| 247 |
second_degree_edges = {}
|
| 248 |
for neighbor_id, neighbor_type in neighbor_nodes:
|
| 249 |
neighbor_edges = _gather_neighbor_edges(data, neighbor_id, neighbor_type, protein_id)
|
| 250 |
-
filtered_neighbor_edges = _filter_edges(neighbor_id, neighbor_edges, prediction_df,
|
|
|
|
|
|
|
|
|
|
| 251 |
|
| 252 |
# Merge filtered neighbor edges into second_degree_edges
|
| 253 |
for edge_type, edges in filtered_neighbor_edges.items():
|
|
|
|
| 131 |
|
| 132 |
return protein_edges
|
| 133 |
|
| 134 |
+
def _filter_edges(protein_id, protein_edges, prediction_df, limit=10, is_second_degree=False, second_degree_limit=3):
|
| 135 |
+
"""
|
| 136 |
+
Filter edges based on type and limit
|
| 137 |
+
|
| 138 |
+
Args:
|
| 139 |
+
protein_id: ID of the protein
|
| 140 |
+
protein_edges: Dictionary of edges to filter
|
| 141 |
+
prediction_df: DataFrame containing predictions
|
| 142 |
+
limit: Maximum number of edges to keep for first-degree connections
|
| 143 |
+
is_second_degree: Whether these are second-degree edges
|
| 144 |
+
second_degree_limit: Maximum number of edges to keep for second-degree connections
|
| 145 |
+
"""
|
| 146 |
filtered_edges = {}
|
| 147 |
|
| 148 |
+
# Use appropriate limit based on edge degree
|
| 149 |
+
current_limit = second_degree_limit if is_second_degree else limit
|
| 150 |
+
|
| 151 |
prediction_categories = prediction_df['GO_category'].unique()
|
| 152 |
prediction_categories = [GO_CATEGORY_MAPPING[category] for category in prediction_categories]
|
| 153 |
go_category_reverse_mapping = {v:k for k, v in GO_CATEGORY_MAPPING.items()}
|
|
|
|
| 174 |
edge = (protein_id, term)
|
| 175 |
is_ground_truth = edge in edges_set
|
| 176 |
valid_edges.append((edge, prob, is_ground_truth))
|
| 177 |
+
if len(valid_edges) >= current_limit:
|
| 178 |
break
|
| 179 |
filtered_edges[edge_type] = valid_edges
|
| 180 |
else:
|
| 181 |
# If no predictions but it's a GO category in prediction_df
|
| 182 |
+
filtered_edges[edge_type] = [(edge, 'no_pred', True) for edge in list(edges)[:current_limit]]
|
| 183 |
else:
|
| 184 |
# For GO terms not in prediction_df, mark them as ground truth with blue color
|
| 185 |
+
filtered_edges[edge_type] = [(edge, 'no_pred', True) for edge in list(edges)[:current_limit]]
|
| 186 |
else:
|
| 187 |
# For non-GO edges, include all edges up to limit
|
| 188 |
+
filtered_edges[edge_type] = [(edge, None, True) for edge in list(edges)[:current_limit]]
|
| 189 |
|
| 190 |
return filtered_edges
|
| 191 |
|
|
|
|
| 239 |
|
| 240 |
# Get the first-degree edges and filter them
|
| 241 |
protein_edges = _gather_protein_edges(data, protein_id)
|
| 242 |
+
first_degree_edges = _filter_edges(protein_id, protein_edges, prediction_df,
|
| 243 |
+
limit=limit, is_second_degree=False)
|
| 244 |
|
| 245 |
# Initialize all_edges with first degree edges
|
| 246 |
all_edges = first_degree_edges.copy()
|
|
|
|
| 258 |
if target != protein_id:
|
| 259 |
neighbor_nodes.add((target, target_type))
|
| 260 |
|
| 261 |
+
# Gather and filter second-degree edges with the smaller limit
|
| 262 |
second_degree_edges = {}
|
| 263 |
for neighbor_id, neighbor_type in neighbor_nodes:
|
| 264 |
neighbor_edges = _gather_neighbor_edges(data, neighbor_id, neighbor_type, protein_id)
|
| 265 |
+
filtered_neighbor_edges = _filter_edges(neighbor_id, neighbor_edges, prediction_df,
|
| 266 |
+
limit=limit,
|
| 267 |
+
is_second_degree=True,
|
| 268 |
+
second_degree_limit=second_degree_limit)
|
| 269 |
|
| 270 |
# Merge filtered neighbor edges into second_degree_edges
|
| 271 |
for edge_type, edges in filtered_neighbor_edges.items():
|