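"""Streamlit app: translate emoji-bearing text into plain language with a
fine-tuned Qwen model, then score it with a selectable offensive-text
classifier. Includes OCR input for screenshots and a session dashboard."""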
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
import streamlit as st
from PIL import Image
import pytesseract
import openai  # reserved for the Agent Build placeholder at the bottom of the file
import pandas as pd
import plotly.express as px
# ✅ Step 1: Emoji translation model (custom fine-tuned model)
emoji_model_id = "JenniferHJF/qwen1.5-emoji-finetuned"
emoji_tokenizer = AutoTokenizer.from_pretrained(emoji_model_id, trust_remote_code=True)
emoji_model = AutoModelForCausalLM.from_pretrained(
    emoji_model_id,
    trust_remote_code=True,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
).to("cuda" if torch.cuda.is_available() else "cpu")
emoji_model.eval()
# ✅ Step 2: Selectable offensive-text classification models
model_options = {
    "Toxic-BERT": "unitary/toxic-bert",
    "Roberta Offensive": "cardiffnlp/twitter-roberta-base-offensive",
    "BERT Emotion": "bhadresh-savani/bert-base-go-emotion"
}
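# Each checkpoint returns its own label set (toxicity labels, offensive vs.
# non-offensive, or GoEmotions categories), so the UI below echoes whatever
# label string the selected pipeline produces.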
# ✅ Page configuration
st.set_page_config(page_title="Emoji Offensive Text Detector", page_icon="🚨", layout="wide")
# ✅ Page layout
with st.sidebar:
    st.header("🧠 Navigation")
    section = st.radio("Select Mode:", ["📝 Text Moderation", "📊 Text Analysis", "🛠️ Agent Build"])
    if section == "📝 Text Moderation":
        moderation_type = st.selectbox("Select Task Type", ["Normal Text", "Bullet Screen Text"])
        selected_model = st.selectbox("Choose classification model", list(model_options.keys()))
        selected_model_id = model_options[selected_model]
        classifier = pipeline("text-classification", model=selected_model_id, device=0 if torch.cuda.is_available() else -1)
    elif section == "📊 Text Analysis":
        st.markdown("You can view the violation distribution chart and editing suggestions.")
    elif section == "🛠️ Agent Build":
        st.markdown("Upload supporting files for in-context fine-tuning.")
        uploaded_reference = st.file_uploader("Upload fine-tuning reference file", type=["txt", "csv"])
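# Streamlit re-executes this script from the top on every widget interaction,
# so a plain module-level list would be reset on each rerun; st.session_state
# keeps the analysis history alive for the whole session.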
# ✅ Store analysis history (persisted across reruns in session state)
if "history" not in st.session_state:
    st.session_state.history = []
history = st.session_state.history
def classify_emoji_text(text: str):
    # Prompt format the fine-tuned model was trained on:
    # "输入：<text>\n输出：" ("Input: <text>\nOutput:")
    prompt = f"输入：{text}\n输出："
    input_ids = emoji_tokenizer(prompt, return_tensors="pt").to(emoji_model.device)
    with torch.no_grad():
        output_ids = emoji_model.generate(**input_ids, max_new_tokens=64, do_sample=False)
    decoded = emoji_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    translated_text = decoded.split("输出：")[-1].strip() if "输出：" in decoded else decoded.strip()
    result = classifier(translated_text)[0]
    label = result["label"]
    score = result["score"]
    reasoning = f"The sentence was flagged as '{label}' due to potentially offensive phrases. Consider replacing emotionally charged, ambiguous, or abusive terms."
    history.append({"text": text, "translated": translated_text, "label": label, "score": score, "reason": reasoning})
    return translated_text, label, score, reasoning
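# Hypothetical example of the returned tuple (actual label names depend on the
# classifier chosen in the sidebar):
#   classify_emoji_text("你是🐷")
#   -> ("你是猪", "toxic", 0.97, "The sentence was flagged as 'toxic' ...")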
# ✅ Section logic
if section == "📝 Text Moderation":
    st.title("📝 Offensive Text Classification")
    st.markdown("### ✍️ Input your sentence:")
    default_text = "你是🐷"  # demo input: "You are a pig", with the insult written as an emoji
    text = st.text_area("Enter sentence with emojis:", value=default_text, height=150)
    if st.button("🚦 Analyze"):
        with st.spinner("🔍 Processing..."):
            try:
                translated, label, score, reason = classify_emoji_text(text)
                st.markdown("### 🔄 Translated sentence:")
                st.code(translated, language="text")
                st.markdown(f"### 🎯 Prediction: `{label}`")
                st.markdown(f"### 📊 Confidence Score: `{score:.2%}`")
                st.markdown("### 🧠 Model Explanation:")
                st.info(reason)
            except Exception as e:
                st.error(f"❌ An error occurred during processing:\n\n{e}")
st.markdown("---")
st.markdown("### ๐Ÿ–ผ๏ธ Or upload a screenshot of bullet comments:")
uploaded_file = st.file_uploader("Upload an image (JPG/PNG)", type=["jpg", "jpeg", "png"])
if uploaded_file is not None:
image = Image.open(uploaded_file)
st.image(image, caption="Uploaded Screenshot", use_column_width=True)
with st.spinner("๐Ÿง  Extracting text via OCR..."):
ocr_text = pytesseract.image_to_string(image, lang="chi_sim+eng")
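        # Assumes the Tesseract binary and the chi_sim language data are
        # installed on the host (e.g. apt packages tesseract-ocr and
        # tesseract-ocr-chi-sim); the extracted text then flows through the
        # same emoji-translation pipeline as typed input.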
st.markdown("#### ๐Ÿ“‹ Extracted Text:")
st.code(ocr_text.strip())
translated, label, score, reason = classify_emoji_text(ocr_text.strip())
st.markdown("### ๐Ÿ”„ Translated sentence:")
st.code(translated, language="text")
st.markdown(f"### ๐ŸŽฏ Prediction: `{label}`")
st.markdown(f"### ๐Ÿ“Š Confidence Score: `{score:.2%}`")
st.markdown("### ๐Ÿง  Model Explanation:")
st.info(reason)
elif section == "📊 Text Analysis":
    st.title("📊 Violation Analysis Dashboard")
    if history:
        df = pd.DataFrame(history)
        label_counts = df["label"].value_counts().reset_index()
        label_counts.columns = ["Category", "Count"]
        fig = px.pie(label_counts, names="Category", values="Count", title="Offensive Category Distribution", color_discrete_sequence=px.colors.sequential.RdBu)
        st.plotly_chart(fig)
        st.markdown("### 🧾 Offensive Terms & Suggestions")
        for item in history:
            st.markdown(f"- 🔹 **Input:** `{item['text']}`")
            st.markdown(f"  - ✨ **Translated:** `{item['translated']}`")
            st.markdown(f"  - ❗ **Label:** `{item['label']}` with **{item['score']:.2%}** confidence")
            st.markdown(f"  - 🔧 **Suggestion:** {item['reason']}")
    else:
        st.info("⚠️ No classification data available yet.")
elif section == "🛠️ Agent Build":
    st.title("🛠️ Agent Assistant for Text Classification")
    st.markdown("Upload context files and interact with an assistant to guide text moderation.")
    if uploaded_reference is not None:
        content = uploaded_reference.read().decode("utf-8")
        st.text_area("📄 Uploaded Reference Preview:", content, height=300)
    prompt = st.text_area("💬 Ask the Assistant Anything:", "How can I improve detection on emotional slang?")
    if st.button("💡 Analyze with Agent"):
        st.info("(This is a placeholder for future integration with a fine-tuned LLM or API call.)")