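"""Streamlit app: translate emoji-bearing text into plain language with a
fine-tuned Qwen model, then score it with a selectable offensive-text
classifier. Includes OCR input for screenshots and a session dashboard."""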
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
import streamlit as st
from PIL import Image
import pytesseract
import openai  # reserved for the Agent Build placeholder at the bottom of the file
import pandas as pd
import plotly.express as px
# ✅ Step 1: Emoji translation model (custom fine-tuned model)
emoji_model_id = "JenniferHJF/qwen1.5-emoji-finetuned"
emoji_tokenizer = AutoTokenizer.from_pretrained(emoji_model_id, trust_remote_code=True)
emoji_model = AutoModelForCausalLM.from_pretrained(
    emoji_model_id,
    trust_remote_code=True,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
).to("cuda" if torch.cuda.is_available() else "cpu")
emoji_model.eval()
# ✅ Step 2: Selectable offensive-text classification models
model_options = {
    "Toxic-BERT": "unitary/toxic-bert",
    "Roberta Offensive": "cardiffnlp/twitter-roberta-base-offensive",
    "BERT Emotion": "bhadresh-savani/bert-base-go-emotion"
}
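# Each checkpoint returns its own label set (toxicity labels, offensive vs.
# non-offensive, or GoEmotions categories), so the UI below echoes whatever
# label string the selected pipeline produces.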
# ✅ Page configuration
st.set_page_config(page_title="Emoji Offensive Text Detector", page_icon="🚨", layout="wide")
# ✅ Page layout
with st.sidebar:
    st.header("🧠 Navigation")
    section = st.radio("Select Mode:", ["📝 Text Moderation", "📊 Text Analysis", "🛠️ Agent Build"])
    if section == "📝 Text Moderation":
        moderation_type = st.selectbox("Select Task Type", ["Normal Text", "Bullet Screen Text"])
        selected_model = st.selectbox("Choose classification model", list(model_options.keys()))
        selected_model_id = model_options[selected_model]
        classifier = pipeline("text-classification", model=selected_model_id, device=0 if torch.cuda.is_available() else -1)
    elif section == "📊 Text Analysis":
        st.markdown("You can view the violation distribution chart and editing suggestions.")
    elif section == "🛠️ Agent Build":
        st.markdown("Upload supporting files for in-context fine-tuning.")
        uploaded_reference = st.file_uploader("Upload fine-tuning reference file", type=["txt", "csv"])
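# Streamlit re-executes this script from the top on every widget interaction,
# so a plain module-level list would be reset on each rerun; st.session_state
# keeps the analysis history alive for the whole session.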
# ✅ Store analysis history (persisted across reruns in session state)
if "history" not in st.session_state:
    st.session_state.history = []
history = st.session_state.history
def classify_emoji_text(text: str):
    # Prompt format the fine-tuned model was trained on:
    # "输入：<text>\n输出：" ("Input: <text>\nOutput:")
    prompt = f"输入：{text}\n输出："
    input_ids = emoji_tokenizer(prompt, return_tensors="pt").to(emoji_model.device)
    with torch.no_grad():
        output_ids = emoji_model.generate(**input_ids, max_new_tokens=64, do_sample=False)
    decoded = emoji_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    translated_text = decoded.split("输出：")[-1].strip() if "输出：" in decoded else decoded.strip()
    result = classifier(translated_text)[0]
    label = result["label"]
    score = result["score"]
    reasoning = f"The sentence was flagged as '{label}' due to potentially offensive phrases. Consider replacing emotionally charged, ambiguous, or abusive terms."
    history.append({"text": text, "translated": translated_text, "label": label, "score": score, "reason": reasoning})
    return translated_text, label, score, reasoning
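# Hypothetical example of the returned tuple (actual label names depend on the
# classifier chosen in the sidebar):
#   classify_emoji_text("你是🐷")
#   -> ("你是猪", "toxic", 0.97, "The sentence was flagged as 'toxic' ...")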
# ✅ Section logic
if section == "📝 Text Moderation":
    st.title("📝 Offensive Text Classification")
    st.markdown("### ✍️ Input your sentence:")
    default_text = "你是🐷"  # demo input: "You are a pig", with the insult written as an emoji
    text = st.text_area("Enter sentence with emojis:", value=default_text, height=150)
    if st.button("🚦 Analyze"):
        with st.spinner("🔍 Processing..."):
            try:
                translated, label, score, reason = classify_emoji_text(text)
                st.markdown("### 🔄 Translated sentence:")
                st.code(translated, language="text")
                st.markdown(f"### 🎯 Prediction: `{label}`")
                st.markdown(f"### 📊 Confidence Score: `{score:.2%}`")
                st.markdown("### 🧠 Model Explanation:")
                st.info(reason)
            except Exception as e:
                st.error(f"❌ An error occurred during processing:\n\n{e}")
st.markdown("---")
st.markdown("### ๐Ÿ–ผ๏ธ Or upload a screenshot of bullet comments:")
uploaded_file = st.file_uploader("Upload an image (JPG/PNG)", type=["jpg", "jpeg", "png"])
if uploaded_file is not None:
image = Image.open(uploaded_file)
st.image(image, caption="Uploaded Screenshot", use_column_width=True)
with st.spinner("๐Ÿง  Extracting text via OCR..."):
ocr_text = pytesseract.image_to_string(image, lang="chi_sim+eng")
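        # Assumes the Tesseract binary and the chi_sim language data are
        # installed on the host (e.g. apt packages tesseract-ocr and
        # tesseract-ocr-chi-sim); the extracted text then flows through the
        # same emoji-translation pipeline as typed input.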
st.markdown("#### ๐Ÿ“‹ Extracted Text:")
st.code(ocr_text.strip())
translated, label, score, reason = classify_emoji_text(ocr_text.strip())
st.markdown("### ๐Ÿ”„ Translated sentence:")
st.code(translated, language="text")
st.markdown(f"### ๐ŸŽฏ Prediction: `{label}`")
st.markdown(f"### ๐Ÿ“Š Confidence Score: `{score:.2%}`")
st.markdown("### ๐Ÿง  Model Explanation:")
st.info(reason)
elif section == "📊 Text Analysis":
    st.title("📊 Violation Analysis Dashboard")
    if history:
        df = pd.DataFrame(history)
        label_counts = df["label"].value_counts().reset_index()
        label_counts.columns = ["Category", "Count"]
        fig = px.pie(label_counts, names="Category", values="Count", title="Offensive Category Distribution", color_discrete_sequence=px.colors.sequential.RdBu)
        st.plotly_chart(fig)
        st.markdown("### 🧾 Offensive Terms & Suggestions")
        for item in history:
            st.markdown(f"- 🔹 **Input:** `{item['text']}`")
            st.markdown(f"  - ✨ **Translated:** `{item['translated']}`")
            st.markdown(f"  - ❗ **Label:** `{item['label']}` with **{item['score']:.2%}** confidence")
            st.markdown(f"  - 🔧 **Suggestion:** {item['reason']}")
    else:
        st.info("⚠️ No classification data available yet.")
elif section == "🛠️ Agent Build":
    st.title("🛠️ Agent Assistant for Text Classification")
    st.markdown("Upload context files and interact with an assistant to guide text moderation.")
    if uploaded_reference is not None:
        content = uploaded_reference.read().decode("utf-8")
        st.text_area("📄 Uploaded Reference Preview:", content, height=300)
    prompt = st.text_area("💬 Ask the Assistant Anything:", "How can I improve detection on emotional slang?")
    if st.button("💡 Analyze with Agent"):
        st.info("(This is a placeholder for future integration with a fine-tuned LLM or API call.)")