{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.11.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[],"dockerImageVersionId":31153,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[
{"cell_type":"code","source":"%pip install gradio transformers torch sentence-transformers","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"outputs":[],"execution_count":null},
{"cell_type":"code","source":"import gradio as gr\nimport torch\nfrom transformers import (\n    AutoTokenizer,\n    AutoModelForSeq2SeqLM\n)\nfrom sentence_transformers import SentenceTransformer, util\nimport numpy as np\nfrom typing import List, Tuple, Dict\nimport re\nimport difflib","metadata":{"trusted":true},"outputs":[],"execution_count":null},
{"cell_type":"code","source":"# Initialize similarity model\nsimilarity_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')","metadata":{"trusted":true},"outputs":[],"execution_count":null},
{"cell_type":"code","source":"PARAPHRASE_MODELS = {\n    \"T5-Base\": \"Vamsi/T5_Paraphrase_Paws\",\n    \"PEGASUS-Paraphrase\": \"tuner007/pegasus_paraphrase\",\n    \"Parrot-Paraphraser\": \"prithivida/parrot_paraphraser_on_T5\",\n    \"BART-Paraphrase\": \"eugenesiow/bart-paraphrase\",\n    \"ChatGPT-Style-T5\": \"humarin/chatgpt_paraphraser_on_T5_base\",\n}","metadata":{"trusted":true},"outputs":[],"execution_count":null},
{"cell_type":"code","source":"EXPANSION_MODELS = {\n    \"Flan-T5-Base\": \"google/flan-t5-base\",\n    \"Flan-T5-Large\": \"google/flan-t5-large\",\n}","metadata":{"trusted":true},"outputs":[],"execution_count":null},
{"cell_type":"code","source":"# Cache for loaded models\nmodel_cache = {}","metadata":{"trusted":true},"outputs":[],"execution_count":null},
{"cell_type":"code","source":"def load_model(model_name: str, model_path: str):\n    \"\"\"Load model and tokenizer with caching\"\"\"\n    if model_name in model_cache:\n        return model_cache[model_name]\n    \n    print(f\"Loading {model_name}...\")\n    tokenizer = AutoTokenizer.from_pretrained(model_path)\n    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)\n    \n    # Move to GPU if available\n    device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n    model = model.to(device)\n    \n    model_cache[model_name] = (model, tokenizer, device)\n    return model, tokenizer, device","metadata":{"trusted":true},"outputs":[],"execution_count":null},
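{"cell_type":"markdown","source":"**Loader smoke test (illustrative).** A minimal check of the `load_model` cache: the second call should return the already-loaded model instead of reloading it. The `T5-Base` entry is used only as an example; the checkpoint is downloaded on first run.","metadata":{}},
{"cell_type":"code","source":"# Illustrative: the first call downloads/loads the checkpoint, the second hits the cache.\n_model_a, _tok_a, _dev = load_model(\"T5-Base\", PARAPHRASE_MODELS[\"T5-Base\"])\n_model_b, _, _ = load_model(\"T5-Base\", PARAPHRASE_MODELS[\"T5-Base\"])\nprint(_dev, _model_a is _model_b)  # expect: 'cuda' or 'cpu', then True","metadata":{"trusted":true},"outputs":[],"execution_count":null},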
{"cell_type":"code","source":"def chunk_text(text: str, max_sentences: int = 4) -> List[str]:\n    \"\"\"Split text into chunks based on number of sentences\"\"\"\n    sentences = re.split(r'(?<=[.!?]) +', text.strip())\n    chunks = [' '.join(sentences[i:i+max_sentences]) for i in range(0, len(sentences), max_sentences)]\n    return [chunk for chunk in chunks if chunk.strip()]\n\ndef estimate_tokens(text: str) -> int:\n    \"\"\"Estimate number of tokens in text (approximate: 1 token ≈ 0.75 words)\"\"\"\n    word_count = len(text.split())\n    return int(word_count / 0.75)\n\ndef calculate_max_length(input_text: str, mode: str, base_max_length: int) -> int:\n    \"\"\"Calculate appropriate max_length based on input tokens\"\"\"\n    input_tokens = estimate_tokens(input_text)\n    \n    if mode == \"Paraphrase\":\n        # For paraphrasing: output should be 1.2-1.5x input tokens\n        calculated_max = int(input_tokens * 1.5) + 50\n    else:\n        # For expansion: output should be 2-3x input tokens\n        calculated_max = int(input_tokens * 3) + 100\n    \n    # Use the larger of calculated or user-specified max_length\n    final_max_length = max(calculated_max, base_max_length)\n    \n    # Cap at reasonable maximum to avoid memory issues\n    return min(final_max_length, 1024)\n","metadata":{"trusted":true},"outputs":[],"execution_count":null},
{"cell_type":"code","source":"# Earlier token-count-based chunker, kept for reference; superseded by the sentence-based chunk_text above.\n# def chunk_text(text: str, max_tokens: int = 300) -> List[str]:\n#     \"\"\"Split text into chunks based on sentences to avoid exceeding token limits\"\"\"\n#     sentences = re.split(r'(?<=[.!?])\\\\s+', text)\n#     chunks = []\n#     current_chunk = []\n#     current_length = 0\n\n#     for sentence in sentences:\n#         sentence_length = len(sentence.split())\n#         if current_length + sentence_length > max_tokens and current_chunk:\n#             chunks.append(' '.join(current_chunk))\n#             current_chunk = [sentence]\n#             current_length = sentence_length\n#         else:\n#             current_chunk.append(sentence)\n#             current_length += sentence_length\n\n#     if current_chunk:\n#         chunks.append(' '.join(current_chunk))\n\n#     return chunks if chunks else [text]","metadata":{"trusted":true},"outputs":[],"execution_count":null},
{"cell_type":"code","source":"def calculate_similarity(text1: str, text2: str) -> float:\n    \"\"\"Calculate cosine similarity between two texts\"\"\"\n    embeddings = similarity_model.encode([text1, text2], convert_to_tensor=True)\n    similarity = util.cos_sim(embeddings[0], embeddings[1])\n    return similarity.item()","metadata":{"trusted":true},"outputs":[],"execution_count":null},
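{"cell_type":"markdown","source":"**Helper sanity check (illustrative).** A minimal sketch exercising `chunk_text`, `estimate_tokens`, and `calculate_similarity` on a made-up four-sentence sample. The sentences and the expected values in the comments are assumptions for illustration; exact similarity scores depend on the MiniLM embedding model.","metadata":{}},
{"cell_type":"code","source":"# Illustrative sanity check for the chunking, token-estimate, and similarity helpers.\nsample = \"The cat sat on the mat. It was a sunny day. Birds sang outside. The cat purred.\"\n\nprint(chunk_text(sample, max_sentences=2))  # expect two 2-sentence chunks\nprint(estimate_tokens(sample))              # rough estimate: word count / 0.75\nprint(round(calculate_similarity(\n    \"The cat sat on the mat.\",\n    \"A cat was sitting on the mat.\"\n), 4))                                      # close paraphrases score high, but below 1.0","metadata":{"trusted":true},"outputs":[],"execution_count":null},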
{"cell_type":"code","source":"def highlight_differences(original: str, generated: str) -> Tuple[str, str, Dict]:\n    \"\"\"\n    Create highlighted HTML versions of both texts showing differences\n    Returns: (highlighted_original, highlighted_generated, statistics)\n    \"\"\"\n    # Split into words for comparison\n    original_words = original.split()\n    generated_words = generated.split()\n    \n    # Use difflib to find differences\n    diff = difflib.SequenceMatcher(None, original_words, generated_words)\n    \n    highlighted_original = []\n    highlighted_generated = []\n    \n    changes_count = 0\n    additions_count = 0\n    deletions_count = 0\n    unchanged_count = 0\n    word_substitutions = []\n    \n    # Simple default styles for the diff markup: red = removed/changed, green = added/new\n    del_style = 'background-color: #8b2e2e; color: #ffffff; padding: 1px 4px; border-radius: 3px;'\n    add_style = 'background-color: #2e7d32; color: #ffffff; padding: 1px 4px; border-radius: 3px;'\n    \n    for tag, i1, i2, j1, j2 in diff.get_opcodes():\n        original_chunk = ' '.join(original_words[i1:i2])\n        generated_chunk = ' '.join(generated_words[j1:j2])\n        \n        if tag == 'equal':\n            # Unchanged text\n            highlighted_original.append(original_chunk)\n            highlighted_generated.append(generated_chunk)\n            unchanged_count += (i2 - i1)\n        \n        elif tag == 'replace':\n            # Changed text\n            highlighted_original.append(f'<span style=\"{del_style}\">{original_chunk}</span>')\n            highlighted_generated.append(f'<span style=\"{add_style}\">{generated_chunk}</span>')\n            changes_count += max(i2 - i1, j2 - j1)\n            \n            # Track word substitutions (limit to single word changes for clarity)\n            if i2 - i1 == 1 and j2 - j1 == 1:\n                word_substitutions.append((original_chunk, generated_chunk))\n        \n        elif tag == 'delete':\n            # Text removed in generated\n            highlighted_original.append(f'<span style=\"{del_style}\">{original_chunk}</span>')\n            deletions_count += (i2 - i1)\n        \n        elif tag == 'insert':\n            # Text added in generated\n            highlighted_generated.append(f'<span style=\"{add_style}\">{generated_chunk}</span>')\n            additions_count += (j2 - j1)\n    \n    # Join with spaces\n    final_original = ' '.join(highlighted_original)\n    final_generated = ' '.join(highlighted_generated)\n    \n    # Calculate statistics\n    total_original_words = len(original_words)\n    total_generated_words = len(generated_words)\n    \n    percentage_changed = (changes_count + deletions_count + additions_count) / max(total_original_words, 1) * 100\n    percentage_unchanged = (unchanged_count / max(total_original_words, 1)) * 100\n    \n    statistics = {\n        'total_original': total_original_words,\n        'total_generated': total_generated_words,\n        'unchanged': unchanged_count,\n        'changed': changes_count,\n        'added': additions_count,\n        'deleted': deletions_count,\n        'percentage_changed': percentage_changed,\n        'percentage_unchanged': percentage_unchanged,\n        'substitutions': word_substitutions[:10]  # Limit to first 10\n    }\n    \n    return final_original, final_generated, statistics\n","metadata":{"trusted":true},"outputs":[],"execution_count":null},
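{"cell_type":"markdown","source":"**Diff demo (illustrative).** A quick check that `highlight_differences` marks word-level edits and collects single-word substitutions. The sentence pair is a made-up example, and the colors come from the reconstructed inline styles above.","metadata":{}},
{"cell_type":"code","source":"# Made-up example with three single-word substitutions; everything else is unchanged.\nfrom IPython.display import HTML, display\n\nhtml_orig, html_gen, diff_stats = highlight_differences(\n    \"The quick brown fox jumps over the lazy dog\",\n    \"The fast brown fox leaps over the sleepy dog\"\n)\ndisplay(HTML(html_orig + \"<br><br>\" + html_gen))\nprint(diff_stats['substitutions'])  # expect [('quick', 'fast'), ('jumps', 'leaps'), ('lazy', 'sleepy')]","metadata":{"trusted":true},"outputs":[],"execution_count":null},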
{"cell_type":"code","source":"def format_statistics(stats: Dict) -> str:\n    \"\"\"Format statistics into a readable HTML string with dark theme\"\"\"\n    # The inline styles below are reconstructed dark-theme defaults; adjust to taste.\n    html = f\"\"\"\n    <div style=\"background-color: #1e1e1e; color: #e0e0e0; padding: 16px; border-radius: 8px; font-family: sans-serif;\">\n        <div style=\"font-size: 18px; font-weight: bold; margin-bottom: 12px;\">📊 Change Analysis</div>\n        <div style=\"display: grid; grid-template-columns: repeat(4, 1fr); gap: 10px; margin-bottom: 12px;\">\n            <div style=\"background-color: #2d2d2d; padding: 10px; border-radius: 6px; text-align: center;\">\n                <div style=\"font-size: 22px; font-weight: bold;\">{stats['total_original']}</div>\n                <div style=\"font-size: 12px; color: #9e9e9e;\">Original Words</div>\n            </div>\n            <div style=\"background-color: #2d2d2d; padding: 10px; border-radius: 6px; text-align: center;\">\n                <div style=\"font-size: 22px; font-weight: bold;\">{stats['total_generated']}</div>\n                <div style=\"font-size: 12px; color: #9e9e9e;\">Generated Words</div>\n            </div>\n            <div style=\"background-color: #2d2d2d; padding: 10px; border-radius: 6px; text-align: center;\">\n                <div style=\"font-size: 22px; font-weight: bold; color: #81c784;\">{stats['unchanged']}</div>\n                <div style=\"font-size: 12px; color: #9e9e9e;\">Unchanged</div>\n            </div>\n            <div style=\"background-color: #2d2d2d; padding: 10px; border-radius: 6px; text-align: center;\">\n                <div style=\"font-size: 22px; font-weight: bold; color: #e57373;\">{stats['changed']}</div>\n                <div style=\"font-size: 12px; color: #9e9e9e;\">Changed</div>\n            </div>\n        </div>\n        <div style=\"margin-bottom: 6px;\">\n            Modification Rate: {stats['percentage_changed']:.1f}% modified, {stats['percentage_unchanged']:.1f}% preserved\n        </div>\n        <div style=\"margin-bottom: 6px;\">\n            ✚ Added: {stats['added']} words | \n            ✖ Removed: {stats['deleted']} words\n        </div>\n    \"\"\"\n    \n    if stats['substitutions']:\n        html += \"\"\"\n        <div style=\"margin-top: 10px; font-weight: bold;\">🔄 Sample Word Substitutions:</div>\n        <div style=\"margin-top: 6px;\">\n        \"\"\"\n        for orig, new in stats['substitutions']:\n            html += f'<div style=\"margin: 2px 0;\"><span style=\"background-color: #8b2e2e; padding: 1px 4px; border-radius: 3px;\">{orig}</span> → <span style=\"background-color: #2e7d32; padding: 1px 4px; border-radius: 3px;\">{new}</span></div>'\n        html += \"\"\"\n        </div>\n        \"\"\"\n    \n    html += \"\"\"\n        <div style=\"margin-top: 12px; font-size: 12px;\">\n            Legend: \n            <span style=\"background-color: #8b2e2e; padding: 1px 4px; border-radius: 3px;\">Removed/Changed</span>\n            <span style=\"background-color: #2e7d32; padding: 1px 4px; border-radius: 3px;\">Added/New</span>\n        </div>\n    </div>\n    \"\"\"\n    \n    return html\n","metadata":{"trusted":true},"outputs":[],"execution_count":null},
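{"cell_type":"markdown","source":"**Stats card demo (illustrative).** Renders the change-analysis card end to end: diff a made-up sentence pair with `highlight_differences`, then pass the statistics to `format_statistics`. The sample texts are assumptions for illustration.","metadata":{}},
{"cell_type":"code","source":"# Render the change-analysis card for a small made-up example.\nfrom IPython.display import HTML, display\n\n_, _, card_stats = highlight_differences(\n    \"Machine learning models can paraphrase text.\",\n    \"Machine learning systems can rewrite text easily.\"\n)\ndisplay(HTML(format_statistics(card_stats)))","metadata":{"trusted":true},"outputs":[],"execution_count":null},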
{"cell_type":"code","source":"def paraphrase_text(\n    text: str,\n    model_name: str,\n    temperature: float,\n    top_p: float,\n    max_length: int,\n    num_beams: int,\n    max_sentences: int,\n    target_words: int = None,\n    mode: str = \"Paraphrase\"\n) -> Tuple[str, float]:\n    \"\"\"Paraphrase or expand text based on mode\"\"\"\n    \n    if not text.strip():\n        return \"Please enter some text to process.\", 0.0\n    \n    # Select appropriate model based on mode\n    if mode == \"Paraphrase\":\n        models_dict = PARAPHRASE_MODELS\n        if model_name not in models_dict:\n            model_name = list(models_dict.keys())[0]\n        model_path = models_dict[model_name]\n        prefix = \"paraphrase: \" if \"T5\" in model_name else \"\"\n    else:  # Expand mode\n        models_dict = EXPANSION_MODELS\n        if model_name not in models_dict:\n            model_name = list(models_dict.keys())[0]\n        model_path = models_dict[model_name]\n        target_words = target_words or 300\n        prefix = f\"Expand the following text to approximately {target_words} words, adding more details and context: \"\n    \n    # Load model\n    model, tokenizer, device = load_model(model_name, model_path)\n    \n    # Chunk text based on sentences\n    chunks = chunk_text(text, max_sentences=max_sentences)\n    \n    processed_chunks = []\n    \n    print(f\"\\n{'='*60}\")\n    print(f\"Processing {len(chunks)} chunk(s) with {max_sentences} sentences per chunk\")\n    print(f\"{'='*60}\")\n    \n    for i, chunk in enumerate(chunks):\n        # Calculate dynamic max_length for this chunk\n        chunk_max_length = calculate_max_length(chunk, mode, max_length)\n        input_tokens = estimate_tokens(chunk)\n        \n        # Prepare input; the T5 paraphrase checkpoints expect an explicit \" </s>\" end-of-sequence marker\n        input_text = (prefix + chunk + \" </s>\") if mode == \"Paraphrase\" else (prefix + chunk)\n        inputs = tokenizer.encode(\n            input_text, \n            return_tensors=\"pt\", \n            max_length=512, \n            truncation=True\n        )\n        inputs = inputs.to(device)\n        \n        # Calculate min_length to ensure output isn't too short\n        if mode == \"Paraphrase\":\n            min_length_calc = int(input_tokens * 0.8)\n        else:\n            min_length_calc = int(input_tokens * 1.5)\n        \n        # Generate\n        with torch.no_grad():\n            outputs = model.generate(\n                inputs,\n                max_length=chunk_max_length,\n                min_length=min(min_length_calc, chunk_max_length - 10),\n                num_beams=num_beams,\n                temperature=temperature if temperature > 0 else 1.0,\n                top_p=top_p,\n                top_k=120 if mode == \"Paraphrase\" else 50,\n                do_sample=temperature > 0,\n                early_stopping=True,\n                no_repeat_ngram_size=3 if mode == \"Expand\" else 2,\n                length_penalty=1.0 if mode == \"Paraphrase\" else 1.5,\n                repetition_penalty=1.2,\n            )\n        \n        # Decode output\n        processed_text = tokenizer.decode(outputs[0], skip_special_tokens=True)\n        processed_chunks.append(processed_text.strip())\n        \n        output_tokens = estimate_tokens(processed_text)\n        print(f\"Chunk {i+1}/{len(chunks)}:\")\n        print(f\"  Input: {len(chunk.split())} words (~{input_tokens} tokens)\")\n        print(f\"  Output: {len(processed_text.split())} words (~{output_tokens} tokens)\")\n        print(f\"  Max length used: {chunk_max_length}\")\n        print(\"-\" * 60)\n    \n    # Combine chunks with double newline\n    final_text = \"\\n\\n\".join(processed_chunks)\n    \n    # Calculate similarity\n    similarity_score = calculate_similarity(text, final_text)\n    \n    print(f\"{'='*60}\")\n    print(f\"Total: {len(text.split())} → {len(final_text.split())} words\")\n    print(f\"Similarity: {similarity_score:.4f}\")\n    print(f\"{'='*60}\\n\")\n    \n    return final_text, similarity_score","metadata":{"trusted":true},"outputs":[],"execution_count":null},
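{"cell_type":"markdown","source":"**End-to-end check (illustrative).** A minimal call to `paraphrase_text` using the `T5-Base` entry and mid-range generation settings. The sample text and parameter values are assumptions for illustration, not tuned recommendations; the checkpoint is downloaded on first run.","metadata":{}},
{"cell_type":"code","source":"# Minimal end-to-end run; reuses the cached model if the loader demo above was executed.\ndemo_text = \"Machine translation has improved rapidly. Neural models now produce fluent output.\"\n\nresult, score = paraphrase_text(\n    demo_text,\n    model_name=\"T5-Base\",\n    temperature=0.7,\n    top_p=0.9,\n    max_length=256,\n    num_beams=4,\n    max_sentences=2,\n)\nprint(result)\nprint(f\"Similarity: {score:.4f}\")","metadata":{"trusted":true},"outputs":[],"execution_count":null},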
{"cell_type":"code","source":"def update_model_choices(mode: str):\n    \"\"\"Update model dropdown based on selected mode\"\"\"\n    if mode == \"Paraphrase\":\n        choices = list(PARAPHRASE_MODELS.keys())\n    else:\n        choices = list(EXPANSION_MODELS.keys())\n    return gr.Dropdown(choices=choices, value=choices[0])\n\ndef update_parameters_visibility(mode: str):\n    \"\"\"Show/hide target words parameter based on mode\"\"\"\n    if mode == \"Expand\":\n        return gr.Number(visible=True)\n    else:\n        return gr.Number(visible=False)","metadata":{"trusted":true},"outputs":[],"execution_count":null},
{"cell_type":"code","source":"def process_text(\n    input_text: str,\n    mode: str,\n    model_name: str,\n    temperature: float,\n    top_p: float,\n    max_length: int,\n    num_beams: int,\n    max_sentences: int,\n    target_words: int\n):\n    \"\"\"Main processing function\"\"\"\n    try:\n        output_text, similarity = paraphrase_text(\n            input_text,\n            model_name,\n            temperature,\n            top_p,\n            max_length,\n            num_beams,\n            max_sentences,\n            target_words,\n            mode\n        )\n        \n        word_count_original = len(input_text.split())\n        word_count_output = len(output_text.split())\n        \n        # Generate highlighted comparison\n        highlighted_original, highlighted_generated, statistics = highlight_differences(\n            input_text, \n            output_text\n        )\n        \n        # Format statistics\n        stats_html = format_statistics(statistics)\n        \n        # Basic stats line\n        basic_stats = f\"**Original:** {word_count_original} words | **Generated:** {word_count_output} words | **Similarity:** {similarity:.4f}\"\n        \n        return output_text, basic_stats, similarity, highlighted_original, highlighted_generated, stats_html\n    except Exception as e:\n        import traceback\n        error_msg = f\"Error: {str(e)}\\n\\n{traceback.format_exc()}\"\n        print(error_msg)\n        return error_msg, \"Error occurred\", 0.0, \"\", \"\", \"\"\n","metadata":{"trusted":true},"outputs":[],"execution_count":null},
{"cell_type":"code","source":"# Create Gradio interface\nwith gr.Blocks(title=\"Text Paraphraser & Expander\", theme=gr.themes.Soft()) as demo:\n    gr.Markdown(\n        \"\"\"\n        # 📝 Text Paraphraser & Expander\n        Transform your text with AI-powered paraphrasing and expansion capabilities.\n        \"\"\"\n    )\n    \n    with gr.Row():\n        with gr.Column(scale=1):\n            mode = gr.Radio(\n                choices=[\"Paraphrase\", \"Expand\"],\n                value=\"Paraphrase\",\n                label=\"Mode\",\n                info=\"Choose to paraphrase or expand your text\"\n            )\n            \n            model_dropdown = gr.Dropdown(\n                choices=list(PARAPHRASE_MODELS.keys()),\n                value=list(PARAPHRASE_MODELS.keys())[0],\n                label=\"Model Selection\",\n                info=\"Choose the model for processing\"\n            )\n            \n            gr.Markdown(\"### ⚙️ Parameters\")\n            \n            temperature = gr.Slider(\n                minimum=0.0,\n                maximum=2.0,\n                value=0.7,\n                step=0.1,\n                label=\"Temperature\",\n                info=\"Higher = more creative, Lower = more focused\"\n            )\n            \n            top_p = gr.Slider(\n                minimum=0.1,\n                maximum=1.0,\n                value=0.9,\n                step=0.05,\n                label=\"Top-p (Nucleus Sampling)\",\n                info=\"Probability threshold for token selection\"\n            )\n            \n            max_length = gr.Slider(\n                minimum=128,\n                maximum=1024,\n                value=512,\n                step=32,\n                label=\"Max Length (tokens)\",\n                info=\"Maximum length of generated text per chunk\"\n            )\n            \n            num_beams = gr.Slider(\n                minimum=1,\n                maximum=10,\n                value=4,\n                step=1,\n                label=\"Number of Beams\",\n                info=\"Higher = better quality but slower\"\n            )\n            \n            max_sentences = gr.Slider(\n                minimum=1,\n                maximum=10,\n                value=4,\n                step=1,\n                label=\"Sentences per Chunk\",\n                info=\"Number of sentences to process together\"\n            )\n            \n            target_words = gr.Number(\n                value=300,\n                label=\"Target Word Count (Expand mode)\",\n                info=\"Approximate number of words for expansion\",\n                
visible=False\n )\n \n with gr.Row():\n with gr.Column(scale=1):\n gr.Markdown(\"### 📥 Input Text\")\n input_text = gr.Textbox(\n lines=10,\n placeholder=\"Enter your text here...\",\n label=\"Original Text\",\n show_copy_button=True\n )\n \n with gr.Column(scale=1):\n gr.Markdown(\"### 📤 Generated Text\")\n output_text = gr.Textbox(\n lines=10,\n label=\"Processed Text\",\n show_copy_button=True\n )\n \n with gr.Row():\n process_btn = gr.Button(\"🚀 Generate\", variant=\"primary\", size=\"lg\")\n clear_btn = gr.ClearButton([input_text, output_text], value=\"🗑️ Clear\")\n \n stats_display = gr.Markdown()\n \n similarity_display = gr.Number(\n label=\"Cosine Similarity Score\",\n precision=4,\n interactive=False\n )\n \n # Highlighted comparison section\n gr.Markdown(\"---\")\n gr.Markdown(\"## 🔍 Visual Comparison - See What Changed\")\n \n with gr.Row():\n with gr.Column(scale=1):\n gr.Markdown(\"### 📄 Original Text (with changes highlighted)\")\n highlighted_original = gr.HTML(\n label=\"Original with Changes\",\n show_label=False\n )\n \n with gr.Column(scale=1):\n gr.Markdown(\"### ✨ Generated Text (with changes highlighted)\")\n highlighted_generated = gr.HTML(\n label=\"Generated with Changes\",\n show_label=False\n )\n\n change_stats = gr.HTML(label=\"Change Statistics\")\n \n # Event handlers\n mode.change(\n fn=update_model_choices,\n inputs=[mode],\n outputs=[model_dropdown]\n )\n \n mode.change(\n fn=update_parameters_visibility,\n inputs=[mode],\n outputs=[target_words]\n )\n \n process_btn.click(\n fn=process_text,\n inputs=[\n input_text,\n mode,\n model_dropdown,\n temperature,\n top_p,\n max_length,\n num_beams,\n max_sentences,\n target_words\n ],\n outputs=[\n output_text, \n stats_display, \n similarity_display,\n highlighted_original,\n highlighted_generated,\n change_stats\n ]\n )\n \n gr.Markdown(\n \"\"\"\n ---\n ### 💡 Tips:\n - **Paraphrase Mode**: Rewrites text while preserving meaning\n - **Expand Mode**: Adds details and elaboration to make text longer\n - **Sentences per Chunk**: Controls how many sentences are processed together (4 recommended)\n - Adjust temperature for creativity (0.7-1.0 for paraphrase, 1.0-1.5 for expansion)\n - Higher beam count = better quality but slower processing\n - Max length is automatically calculated based on input, but can be overridden\n - Output chunks are separated by double newlines for readability\n \"\"\"\n )","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"if __name__ == \"__main__\":\n demo.launch(share=True)","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"","metadata":{"trusted":true},"outputs":[],"execution_count":null}]}