{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.11.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[],"dockerImageVersionId":31153,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"pip install gradio transformers torch sentence-transformers","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"import gradio as gr\nimport torch\nfrom transformers import (\n AutoTokenizer, \n AutoModelForSeq2SeqLM,\n T5ForConditionalGeneration,\n T5Tokenizer\n)\nfrom sentence_transformers import SentenceTransformer, util\nimport numpy as np\nfrom typing import List, Tuple, Dict\nimport re\nimport numpy as np\nimport difflib","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# Initialize similarity model\nsimilarity_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"PARAPHRASE_MODELS = {\n \"T5-Base\": \"Vamsi/T5_Paraphrase_Paws\",\n \"PEGASUS-Paraphrase\": \"tuner007/pegasus_paraphrase\",\n \"Parrot-Paraphraser\": \"prithivida/parrot_paraphraser_on_T5\",\n \"BART-Paraphrase\": \"eugenesiow/bart-paraphrase\",\n \"ChatGPT-Style-T5\": \"humarin/chatgpt_paraphraser_on_T5_base\",\n}","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"EXPANSION_MODELS = {\n \"Flan-T5-Base\": \"google/flan-t5-base\",\n \"Flan-T5-Large\": \"google/flan-t5-large\",\n}","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# Cache for loaded models\nmodel_cache = {}","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"def load_model(model_name: str, model_path: str):\n \"\"\"Load model and tokenizer with caching\"\"\"\n if model_name in model_cache:\n return model_cache[model_name]\n \n print(f\"Loading {model_name}...\")\n tokenizer = AutoTokenizer.from_pretrained(model_path)\n model = AutoModelForSeq2SeqLM.from_pretrained(model_path)\n \n # Move to GPU if available\n device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n model = model.to(device)\n \n model_cache[model_name] = (model, tokenizer, device)\n return model, tokenizer, device","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"def chunk_text(text: str, max_sentences: int = 4) -> List[str]:\n \"\"\"Split text into chunks based on number of sentences\"\"\"\n import re\n sentences = re.split(r'(?<=[.!?]) +', text.strip())\n chunks = [' '.join(sentences[i:i+max_sentences]) for i in range(0, len(sentences), max_sentences)]\n return [chunk for chunk in chunks if chunk.strip()]\n\ndef estimate_tokens(text: str) -> int:\n \"\"\"Estimate number of tokens in text (approximate: 1 token ≈ 0.75 words)\"\"\"\n word_count = len(text.split())\n return int(word_count / 0.75)\n\ndef calculate_max_length(input_text: str, mode: str, base_max_length: int) -> int:\n \"\"\"Calculate appropriate max_length based on input tokens\"\"\"\n input_tokens = 
{"cell_type":"code","source":"# Earlier token-budget chunker, kept for reference (superseded by the\n# sentence-count version above):\n# def chunk_text(text: str, max_tokens: int = 300) -> List[str]:\n#     \"\"\"Split text into chunks based on sentences to avoid exceeding token limits\"\"\"\n#     sentences = re.split(r'(?<=[.!?])\\s+', text)\n#     chunks = []\n#     current_chunk = []\n#     current_length = 0\n\n#     for sentence in sentences:\n#         sentence_length = len(sentence.split())\n#         if current_length + sentence_length > max_tokens and current_chunk:\n#             chunks.append(' '.join(current_chunk))\n#             current_chunk = [sentence]\n#             current_length = sentence_length\n#         else:\n#             current_chunk.append(sentence)\n#             current_length += sentence_length\n\n#     if current_chunk:\n#         chunks.append(' '.join(current_chunk))\n\n#     return chunks if chunks else [text]","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"def calculate_similarity(text1: str, text2: str) -> float:\n    \"\"\"Calculate cosine similarity between two texts\"\"\"\n    embeddings = similarity_model.encode([text1, text2], convert_to_tensor=True)\n    similarity = util.cos_sim(embeddings[0], embeddings[1])\n    return similarity.item()","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"def highlight_differences(original: str, generated: str) -> Tuple[str, str, Dict]:\n    \"\"\"\n    Create highlighted HTML versions of both texts showing differences\n    Returns: (highlighted_original, highlighted_generated, statistics)\n    \"\"\"\n    # Split into words for comparison\n    original_words = original.split()\n    generated_words = generated.split()\n\n    # Use difflib to find differences\n    diff = difflib.SequenceMatcher(None, original_words, generated_words)\n\n    highlighted_original = []\n    highlighted_generated = []\n\n    changes_count = 0\n    additions_count = 0\n    deletions_count = 0\n    unchanged_count = 0\n    word_substitutions = []\n\n    # NOTE: the inline markup was lost in this copy of the notebook; the span\n    # styles below are assumed placeholders (reds for changed/removed, greens\n    # for added) chosen to suit the dark theme used elsewhere.\n    for tag, i1, i2, j1, j2 in diff.get_opcodes():\n        original_chunk = ' '.join(original_words[i1:i2])\n        generated_chunk = ' '.join(generated_words[j1:j2])\n\n        if tag == 'equal':\n            # Unchanged text\n            highlighted_original.append(original_chunk)\n            highlighted_generated.append(generated_chunk)\n            unchanged_count += (i2 - i1)\n\n        elif tag == 'replace':\n            # Changed text\n            highlighted_original.append(f'<span style=\"background-color:#8b3a3a;border-radius:3px;\">{original_chunk}</span>')\n            highlighted_generated.append(f'<span style=\"background-color:#2e6b2e;border-radius:3px;\">{generated_chunk}</span>')\n            changes_count += max(i2 - i1, j2 - j1)\n\n            # Track word substitutions (limit to single word changes for clarity)\n            if i2 - i1 == 1 and j2 - j1 == 1:\n                word_substitutions.append((original_chunk, generated_chunk))\n\n        elif tag == 'delete':\n            # Text removed in generated\n            highlighted_original.append(f'<span style=\"background-color:#8b3a3a;text-decoration:line-through;\">{original_chunk}</span>')\n            deletions_count += (i2 - i1)\n\n        elif tag == 'insert':\n            # Text added in generated\n            highlighted_generated.append(f'<span style=\"background-color:#2e6b2e;border-radius:3px;\">{generated_chunk}</span>')\n            additions_count += (j2 - j1)\n\n    # Join with spaces\n    final_original = ' '.join(highlighted_original)\n    final_generated = ' '.join(highlighted_generated)\n\n    # Calculate statistics\n    total_original_words = len(original_words)\n    total_generated_words = len(generated_words)\n\n    percentage_changed = (changes_count + deletions_count + additions_count) / max(total_original_words, 1) * 100\n    percentage_unchanged = (unchanged_count / max(total_original_words, 1)) * 100\n\n    statistics = {\n        'total_original': total_original_words,\n        'total_generated': total_generated_words,\n        'unchanged': unchanged_count,\n        'changed': changes_count,\n        'added': additions_count,\n        'deleted': deletions_count,\n        'percentage_changed': percentage_changed,\n        'percentage_unchanged': percentage_unchanged,\n        'substitutions': word_substitutions[:10]  # Limit to first 10\n    }\n\n    return final_original, final_generated, statistics","metadata":{"trusted":true},"outputs":[],"execution_count":null},
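{"cell_type":"code","source":"# Illustrative check of the comparison utilities on a made-up sentence pair\n# (the texts and the expectations in the comments are examples, not app data):\norig = \"The quick brown fox jumps over the lazy dog\"\npara = \"A quick brown fox leaps over the sleepy dog\"\n\nprint(round(calculate_similarity(orig, para), 3))   # high for close paraphrases, typically > 0.8\n\nhtml_orig, html_gen, stats = highlight_differences(orig, para)\nprint(stats['changed'], stats['added'], stats['deleted'])\nprint(stats['substitutions'])   # single-word swaps such as ('jumps', 'leaps')","metadata":{"trusted":true},"outputs":[],"execution_count":null},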
{"cell_type":"code","source":"def format_statistics(stats: Dict) -> str:\n    \"\"\"Format statistics into a readable HTML string with dark theme\"\"\"\n    html = f\"\"\"\n