# Source: Hugging Face Space app file (Space status when captured: Sleeping).
import openai
import pandas as pd
import plotly.express as px
import pytesseract
import streamlit as st
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# Step 1: emoji translation model (the author's own fine-tuned checkpoint).
emoji_model_id = "JenniferHJF/qwen1.5-emoji-finetuned"


# Streamlit re-executes this script from the top on every widget interaction,
# so the load is cached and performed once per server process instead of on
# every rerun. The spinner is disabled because this call runs before
# st.set_page_config(), which has to be the first Streamlit command that
# renders anything.
@st.cache_resource(show_spinner=False)
def _load_emoji_model(model_id: str):
    """Load the tokenizer and causal LM for ``model_id`` and return them.

    The model is placed on CUDA in float16 when a GPU is available, otherwise
    on the CPU in float32, and is switched to eval mode.

    Args:
        model_id: Hugging Face Hub id of the checkpoint to load.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    use_cuda = torch.cuda.is_available()
    # NOTE(review): trust_remote_code=True executes Python code shipped in the
    # model repository; keep it only for repositories you trust.
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        torch_dtype=torch.float16 if use_cuda else torch.float32,
    ).to("cuda" if use_cuda else "cpu")
    model.eval()
    return tokenizer, model


emoji_tokenizer, emoji_model = _load_emoji_model(emoji_model_id)
# Step 2: selectable offensive-text classification models.
# Maps the label shown in the sidebar select box to a Hugging Face model id.
model_options = {
    "Toxic-BERT": "unitary/toxic-bert",
    "Roberta Offensive": "cardiffnlp/twitter-roberta-base-offensive",
    "BERT Emotion": "bhadresh-savani/bert-base-go-emotion",
}
# Page configuration.
st.set_page_config(page_title="Emoji Offensive Text Detector", page_icon="๐จ", layout="wide")


@st.cache_resource
def _load_classifier(model_id: str):
    """Build the text-classification pipeline for ``model_id``.

    Cached per model id so that switching widgets (which re-runs the whole
    script) does not rebuild the pipeline each time. Uses GPU 0 when CUDA is
    available, otherwise the CPU.
    """
    return pipeline(
        "text-classification",
        model=model_id,
        device=0 if torch.cuda.is_available() else -1,
    )


# Page layout: the sidebar selects the mode and holds the mode-specific inputs.
with st.sidebar:
    st.header("๐ง Navigation")
    section = st.radio("Select Mode:", ["๐ Text Moderation", "๐ Text Analysis", "๐ ๏ธ Agent Build"])
    # Branch on the plain-text part of the label so the dispatch does not
    # depend on the exact bytes of the decorative emoji prefix.
    if section.endswith("Text Moderation"):
        moderation_type = st.selectbox("Select Task Type", ["Normal Text", "Bullet Screen Text"])
        selected_model = st.selectbox("Choose classification model", list(model_options.keys()))
        selected_model_id = model_options[selected_model]
        # `classifier` is only defined in this mode; it is read as a global by
        # classify_emoji_text().
        classifier = _load_classifier(selected_model_id)
    elif section.endswith("Text Analysis"):
        st.markdown("You can view the violation distribution chart and editing suggestions.")
    elif section.endswith("Agent Build"):
        st.markdown("Upload supporting files for in-context fine-tuning.")
        # `uploaded_reference` is only defined in this mode.
        uploaded_reference = st.file_uploader("Upload fine-tuning reference file", type=["txt", "csv"])
# Store the analysis history.
# Streamlit re-runs the script on every interaction, so a plain module-level
# list would be emptied before the "Text Analysis" mode could ever read it.
# Keeping the list in st.session_state preserves it across reruns of the same
# browser session; `history` stays bound to that list so existing readers and
# writers of the name keep working.
if "history" not in st.session_state:
    st.session_state["history"] = []
history = st.session_state["history"]
def classify_emoji_text(text: str):
    """Translate emoji-laden text with the emoji model, then classify it.

    The text is wrapped in an input/output prompt, the emoji model generates a
    continuation greedily (up to 64 new tokens), and whatever follows the last
    output marker is taken as the translated sentence. That sentence is passed
    to the module-level ``classifier`` and the top result is used.

    Side effect: one record per call is appended to the module-level
    ``history`` list.

    Args:
        text: The raw user sentence, possibly containing emojis.

    Returns:
        A tuple ``(translated_text, label, score, reasoning)`` where ``label``
        and ``score`` come from the classifier's first result and ``reasoning``
        is a fixed explanatory sentence mentioning the label.
    """
    # NOTE(review): the prompt literals were mis-encoded in the captured
    # source; they are restored here as "input:" / "output:" in Chinese with
    # full-width colons. Confirm against the format the model was tuned on.
    output_marker = "输出："
    prompt = f"输入：{text}\n{output_marker}"

    inputs = emoji_tokenizer(prompt, return_tensors="pt").to(emoji_model.device)
    with torch.no_grad():
        output_ids = emoji_model.generate(**inputs, max_new_tokens=64, do_sample=False)
    decoded = emoji_tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # The decoded sequence includes the prompt. rpartition keeps the text after
    # the last marker, or the whole string when the marker is absent.
    translated_text = decoded.rpartition(output_marker)[2].strip()

    result = classifier(translated_text)[0]
    label = result["label"]
    score = result["score"]
    reasoning = f"The sentence was flagged as '{label}' due to potentially offensive phrases. Consider replacing emotionally charged, ambiguous, or abusive terms."
    history.append({"text": text, "translated": translated_text, "label": label, "score": score, "reason": reasoning})
    return translated_text, label, score, reasoning
# Section logic: render the main panel for the mode chosen in the sidebar.
def _show_result(translated, label, score, reason):
    """Render one classification result: translation, label, score, explanation."""
    st.markdown("### ๐ Translated sentence:")
    st.code(translated, language="text")
    st.markdown(f"### ๐ฏ Prediction: `{label}`")
    st.markdown(f"### ๐ Confidence Score: `{score:.2%}`")
    st.markdown("### ๐ง Model Explanation:")
    st.info(reason)


# Branch on the plain-text part of the label so the dispatch does not depend
# on the exact bytes of the decorative emoji prefix.
if section.endswith("Text Moderation"):
    st.title("๐ Offensive Text Classification")
    st.markdown("### โ๏ธ Input your sentence:")
    # NOTE(review): the sample sentence was mis-encoded in the captured source;
    # restored here as a short Chinese sentence ending in a pig emoji. Confirm.
    default_text = "你是🐷"
    text = st.text_area("Enter sentence with emojis:", value=default_text, height=150)
    if st.button("๐ฆ Analyze"):
        if not text.strip():
            # Nothing to translate or classify.
            st.warning("Please enter a sentence to analyze.")
        else:
            with st.spinner("๐ Processing..."):
                try:
                    translated, label, score, reason = classify_emoji_text(text)
                    _show_result(translated, label, score, reason)
                except Exception as e:  # UI boundary: report instead of crashing the page.
                    st.error(f"โ An error occurred during processing:\n\n{e}")

    st.markdown("---")
    st.markdown("### ๐ผ๏ธ Or upload a screenshot of bullet comments:")
    uploaded_file = st.file_uploader("Upload an image (JPG/PNG)", type=["jpg", "jpeg", "png"])
    if uploaded_file is not None:
        # Same error handling as the text path: image decoding, OCR and the
        # models can all fail, and the failure should be shown in the page.
        try:
            image = Image.open(uploaded_file)
            st.image(image, caption="Uploaded Screenshot", use_column_width=True)
            with st.spinner("๐ง Extracting text via OCR..."):
                ocr_text = pytesseract.image_to_string(image, lang="chi_sim+eng").strip()
            st.markdown("#### ๐ Extracted Text:")
            st.code(ocr_text)
            if ocr_text:
                translated, label, score, reason = classify_emoji_text(ocr_text)
                _show_result(translated, label, score, reason)
            else:
                # Do not classify (or record in history) an empty OCR result.
                st.warning("No text could be extracted from the image.")
        except Exception as e:  # UI boundary: report instead of crashing the page.
            st.error(f"โ An error occurred during processing:\n\n{e}")

elif section.endswith("Text Analysis"):
    st.title("๐ Violation Analysis Dashboard")
    if history:
        # Pie chart of how often each predicted label occurs in the history.
        df = pd.DataFrame(history)
        label_counts = df["label"].value_counts().reset_index()
        label_counts.columns = ["Category", "Count"]
        fig = px.pie(
            label_counts,
            names="Category",
            values="Count",
            title="Offensive Category Distribution",
            color_discrete_sequence=px.colors.sequential.RdBu,
        )
        st.plotly_chart(fig)

        st.markdown("### ๐งพ Offensive Terms & Suggestions")
        for item in history:
            st.markdown(f"- ๐น **Input:** `{item['text']}`")
            st.markdown(f" - โจ **Translated:** `{item['translated']}`")
            st.markdown(f" - โ **Label:** `{item['label']}` with **{item['score']:.2%}** confidence")
            st.markdown(f" - ๐ง **Suggestion:** {item['reason']}")
    else:
        st.info("โ ๏ธ No classification data available yet.")

elif section.endswith("Agent Build"):
    st.title("๐ ๏ธ Agent Assistant for Text Classification")
    st.markdown("Upload context files and interact with an assistant to guide text moderation.")
    if uploaded_reference is not None:
        # Preview only: undecodable bytes are replaced rather than raising, so
        # a file that is not valid UTF-8 still displays.
        content = uploaded_reference.read().decode("utf-8", errors="replace")
        st.text_area("๐ Uploaded Reference Preview:", content, height=300)
    prompt = st.text_area("๐ฌ Ask the Assistant Anything:", "How can I improve detection on emotional slang?")
    if st.button("๐ก Analyze with Agent"):
        st.info("(This is a placeholder for future integration with a fine-tuned LLM or API call.)")