Enoch Jason J committed on
Commit 9fca407 · 0 Parent(s):

Add application file

Files changed (7)
  1. .gitignore +67 -0
  2. Dockerfile +35 -0
  3. document_pipeline.py +133 -0
  4. download_models.py +39 -0
  5. main.py +126 -0
  6. requirements.txt +8 -0
  7. requirements_local.txt +8 -0
.gitignore ADDED
@@ -0,0 +1,67 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.pyc
+ *.pyd
+ *.pyo
+ *.dll
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ .coverage
+ .tox/
+ htmlcov/
+ .pytest_cache/
+
+ # Editors
+ .vscode/
+ .idea/
+
+ # OS
+ .DS_Store
+ .Trashes
+ Thumbs.db
+
+ # Virtual environment
+ .venv/
+ venv/
+ env/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # MyPy
+ .mypy_cache/
+
+ # PyInstaller
+ *.spec
+ build/
+ dist/
+
+ # Temporary files
+ *.tmp
+ *.bak
+ *.swp
+ *~
Dockerfile ADDED
@@ -0,0 +1,35 @@
+ # Use a standard Python slim image for a lightweight CPU environment
+ FROM python:3.11-slim
+
+ # Set the working directory
+ WORKDIR /app
+
+ # --- Installation ---
+ # Copy the local requirements file and the model downloader script
+ COPY requirements_local.txt .
+ COPY download_models.py .
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir -r requirements_local.txt
+
+ # --- Pre-download and Cache Models during the Build Process ---
+ # This makes container startup fast and reliable. The token is passed as a
+ # build argument so it never has to live in the final image's environment.
+ # NOTE: the downloads must land in an image layer; a BuildKit cache mount on
+ # /root/.cache/huggingface would be discarded after the build, leaving the
+ # runtime container without the models.
+ ARG HUGGING_FACE_HUB_TOKEN
+ RUN HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN} python download_models.py
+
+ # Copy the main application code
+ COPY main.py .
+
+ # IMPORTANT: If your LoRA adapter is a local folder, you need to copy it in.
+ # For example:
+ # COPY ./my_local_lora_adapter /app/my_local_lora_adapter
+ # Then, in main.py, set LORA_ADAPTER_PATH = "/app/my_local_lora_adapter"
+
+ # Expose the port the app runs on
+ EXPOSE 8000
+
+ # Command to run the application
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
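A hypothetical build-and-run sketch for this image (the hf_xxx token value and the text-correction-api tag are placeholders, not names from this repo):

    # pass the Hub token only at build time, then serve on port 8000
    docker build --build-arg HUGGING_FACE_HUB_TOKEN=hf_xxx -t text-correction-api .
    docker run -p 8000:8000 text-correction-api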
document_pipeline.py ADDED
@@ -0,0 +1,133 @@
+ import requests
+ import re
+ from fpdf import FPDF
+ import os
+ import textract
+
+ # --- Configuration ---
+ AI_SERVICE_URL = "http://localhost:8000"
+ INPUT_DOC_PATH = "Doreen.doc"
+ OUTPUT_PDF_PATH = "Doreen DeFio_Dr. Daniel Rich_Report_Generated.pdf"
+
+ def correct_text_via_api(endpoint: str, text: str) -> str:
+     try:
+         response = requests.post(f"{AI_SERVICE_URL}/{endpoint}", json={"text": text})
+         response.raise_for_status()
+         return response.json()["corrected_text"]
+     except requests.exceptions.RequestException as e:
+         print(f"Error calling AI service at endpoint '{endpoint}': {e}")
+         return text
+
+ def extract_text_from_doc(filepath):
+     if not os.path.exists(filepath):
+         raise FileNotFoundError(f"Input file not found at: {filepath}")
+     try:
+         text_bytes = textract.process(filepath)
+         return text_bytes.decode('utf-8')
+     except Exception as e:
+         print(f"Error reading document with textract: {e}")
+         return None
+
+ def parse_and_correct_text(raw_text):
+     structured_data = {}
+     current_section = None
+     buffer = []
+     key_value_pattern = re.compile(
+         r'^\s*(Client Name|Date of Exam|Date of Accident|Examinee|Observed By|Performed By|Specialty|Facility|Facility Description|Appointment Scheduled For|Arrived at Office|Admitted to Exam Room|Intake Start|Exam Start|Exam End|Length of Exam|Total Length of Visit|Others Present|Description of IME physician|Layout of Exam Room|Did IME Physician Have Examinees Medical Records)\s*:\s*(.*)',
+         re.IGNORECASE | re.DOTALL
+     )
+     section_headers = ["Intake:", "Exam:"]
+     lines = [line.strip() for line in raw_text.split('\n') if line.strip()]
+
+     i = 0
+     while i < len(lines):
+         line = lines[i]
+         if line in section_headers:
+             # Flush the previous narrative section before starting a new one.
+             if current_section and buffer:
+                 full_paragraph = " ".join(buffer)
+                 grammar_corrected = correct_text_via_api("correct_grammar", full_paragraph)
+                 final_corrected = correct_text_via_api("correct_gender", grammar_corrected)
+                 structured_data[current_section] = final_corrected
+             current_section = line.replace(":", "").strip()
+             buffer = []
+             i += 1
+             continue
+         match = key_value_pattern.match(line)
+         if match:
+             key, value = map(str.strip, match.groups())
+             # A key with an empty value takes the next line as its value.
+             if not value and (i + 1) < len(lines) and not key_value_pattern.match(lines[i+1]) and lines[i+1] not in section_headers:
+                 value = lines[i+1]
+                 i += 1
+             structured_data[key] = correct_text_via_api("correct_grammar", value)
+         elif current_section:
+             buffer.append(line)
+         i += 1
+     # Flush the final section.
+     if current_section and buffer:
+         full_paragraph = " ".join(buffer)
+         grammar_corrected = correct_text_via_api("correct_grammar", full_paragraph)
+         final_corrected = correct_text_via_api("correct_gender", grammar_corrected)
+         structured_data[current_section] = final_corrected
+     return structured_data
+
+ FONT_FAMILY = "DejaVu"  # swapped for a core font below if the DejaVu TTFs are missing
+
+ class PDF(FPDF):
+     def header(self):
+         self.set_font(FONT_FAMILY, "B", 15)
+         self.cell(0, 10, 'IME WatchDog Report', 0, 1, 'C')
+         self.ln(10)
+
+     def footer(self):
+         self.set_y(-15)
+         self.set_font(FONT_FAMILY, "I", 8)
+         self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C')
+
+ def generate_pdf(data, output_path):
+     global FONT_FAMILY
+     pdf = PDF()
+     # --- FIX: Add a Unicode font that supports characters like ’ ---
+     # You may need to provide the path to the .ttf font files if they are not in a standard location.
+     try:
+         pdf.add_font("DejaVu", "", "DejaVuSans.ttf", uni=True)
+         pdf.add_font("DejaVu", "B", "DejaVuSans-Bold.ttf", uni=True)
+         pdf.add_font("DejaVu", "I", "DejaVuSans-Oblique.ttf", uni=True)
+     except (RuntimeError, FileNotFoundError):
+         print("---")
+         print("⚠️ FONT WARNING: DejaVuSans.ttf not found.")
+         print("The PDF will be generated with a core font, but may have character issues.")
+         print("Please download the DejaVu font family and place the .ttf files in this directory.")
+         print("---")
+         FONT_FAMILY = "Helvetica"  # fall back to a core font instead of crashing on set_font
+
+     pdf.add_page()
+     pdf.set_font(FONT_FAMILY, "", 12)
+     key_order = [
+         "Client Name", "Date of Exam", "Date of Accident", "Examinee", "Observed By",
+         "Performed By", "Specialty", "Facility", "Facility Description",
+         "Appointment Scheduled For", "Arrived at Office", "Admitted to Exam Room",
+         "Intake Start", "Exam Start", "Exam End", "Length of Exam", "Total Length of Visit",
+         "Others Present", "Description of IME physician", "Layout of Exam Room",
+         "Did IME Physician Have Examinees Medical Records", "Intake", "Exam"
+     ]
+     for key in key_order:
+         if key in data:
+             value = data[key]
+             pdf.set_font(FONT_FAMILY, "B", 12)
+             pdf.cell(0, 10, f"{key}:", ln=True)
+             pdf.set_font(FONT_FAMILY, "", 12)
+             pdf.multi_cell(0, 8, str(value))
+             pdf.ln(4)
+     pdf.output(output_path)
+     print(f"✅ Successfully generated PDF report at: {output_path}")
+
+ if __name__ == "__main__":
+     print("--- Starting Document Transformation Pipeline ---")
+     if os.path.exists(INPUT_DOC_PATH):
+         print(f"1. Extracting text from '{INPUT_DOC_PATH}' using textract...")
+         raw_document_text = extract_text_from_doc(INPUT_DOC_PATH)
+         if raw_document_text:
+             print("2. Parsing and correcting text via AI microservice...")
+             corrected_data = parse_and_correct_text(raw_document_text)
+             print(f"3. Generating PDF report '{OUTPUT_PDF_PATH}'...")
+             generate_pdf(corrected_data, OUTPUT_PDF_PATH)
+             print("--- Pipeline Finished ---")
+     else:
+         print(f"❌ ERROR: Input file not found: '{INPUT_DOC_PATH}'")
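Running this pipeline end to end assumes the FastAPI service below is already listening on AI_SERVICE_URL, and that requests plus the packages in requirements_local.txt are installed. A minimal local sketch:

    # terminal 1: start the correction service
    uvicorn main:app --host 0.0.0.0 --port 8000
    # terminal 2: run the document-to-PDF pipeline against it
    python document_pipeline.py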
download_models.py ADDED
@@ -0,0 +1,39 @@
+ import os
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from peft import PeftModel
+
+ # This script is run during the Docker build process to pre-download models.
+
+ GENDER_MODEL_PATH = "google/gemma-3-270m-qat-q4_0-unquantized"
+ BASE_MODEL_PATH = "unsloth/gemma-2b-it"
+ LORA_ADAPTER_PATH = "unsloth/gemma-2b-it-lora-test"
+
+ hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
+ if not hf_token:
+     raise ValueError("HUGGING_FACE_HUB_TOKEN environment variable is required to download models.")
+
+ print("--- Starting Model Pre-downloading ---")
+
+ # 1. Download Gender Model
+ print(f"Downloading: {GENDER_MODEL_PATH}")
+ AutoTokenizer.from_pretrained(GENDER_MODEL_PATH, token=hf_token)
+ AutoModelForCausalLM.from_pretrained(GENDER_MODEL_PATH, token=hf_token)
+ print("✅ Gender model downloaded.")
+
+ # 2. Download Grammar Model (Base + Adapter)
+ print(f"Downloading: {BASE_MODEL_PATH}")
+ base_model = AutoModelForCausalLM.from_pretrained(
+     BASE_MODEL_PATH,
+     token=hf_token,
+     dtype=torch.float32,
+ )
+ AutoTokenizer.from_pretrained(BASE_MODEL_PATH, token=hf_token)
+ print("✅ Base model downloaded.")
+
+ print(f"Downloading: {LORA_ADAPTER_PATH}")
+ PeftModel.from_pretrained(base_model, LORA_ADAPTER_PATH, token=hf_token)
+ print("✅ LoRA adapter downloaded.")
+
+ print("--- Model Pre-downloading Complete ---")
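For a local dry run outside Docker, the script only needs the token in its environment, mirroring the RUN line in the Dockerfile (the token value is a placeholder):

    HUGGING_FACE_HUB_TOKEN=hf_xxx python download_models.py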
main.py ADDED
@@ -0,0 +1,126 @@
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+ import torch
+ import re
+ import os
+
+ # --- Import Libraries ---
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from peft import PeftModel
+
+ # --- Model Paths (will be loaded from local cache) ---
+ GENDER_MODEL_PATH = "google/gemma-3-270m-qat-q4_0-unquantized"
+ BASE_MODEL_PATH = "unsloth/gemma-2b-it"
+ LORA_ADAPTER_PATH = "unsloth/gemma-2b-it-lora-test"
+
+ # --- Global variables for models ---
+ grammar_model = None
+ grammar_tokenizer = None
+ gender_model = None
+ gender_tokenizer = None
+ device = "cpu"
+
+ print("--- Starting Model Loading ---")
+
+ # Models were cached into the image at build time, so the token is normally
+ # absent at runtime; read it anyway in case the cache is cold.
+ hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
+
+ try:
+     # Load models from the local cache inside the container. Startup is now fast.
+     print(f"Loading gender model from cache: {GENDER_MODEL_PATH}")
+     gender_tokenizer = AutoTokenizer.from_pretrained(GENDER_MODEL_PATH, token=hf_token)
+     gender_model = AutoModelForCausalLM.from_pretrained(GENDER_MODEL_PATH, token=hf_token).to(device)
+     print("✅ Gender verifier model loaded successfully!")
+
+     print(f"Loading base model for grammar correction from cache: {BASE_MODEL_PATH}")
+     base_model = AutoModelForCausalLM.from_pretrained(
+         BASE_MODEL_PATH,
+         token=hf_token,
+         dtype=torch.float32,
+     ).to(device)
+     grammar_tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH, token=hf_token)
+
+     print(f"Applying LoRA adapter from cache: {LORA_ADAPTER_PATH}")
+     grammar_model = PeftModel.from_pretrained(base_model, LORA_ADAPTER_PATH, token=hf_token).to(device)
+     print("✅ Grammar correction model loaded successfully!")
+
+     if grammar_tokenizer.pad_token is None:
+         grammar_tokenizer.pad_token = grammar_tokenizer.eos_token
+     if gender_tokenizer.pad_token is None:
+         gender_tokenizer.pad_token = gender_tokenizer.eos_token
+
+ except Exception as e:
+     print(f"❌ Critical error during model loading: {e}")
+     grammar_model = None
+     gender_model = None
+
+ print("--- Model Loading Complete ---")
+
+
+ # --- FastAPI Application Setup ---
+ app = FastAPI(title="Text Correction API")
+
+ class CorrectionRequest(BaseModel):
+     text: str
+
+ class CorrectionResponse(BaseModel):
+     original_text: str
+     corrected_text: str
+
+ # --- Helper Functions for Text Cleaning ---
+ def clean_grammar_response(text: str) -> str:
+     if "Response:" in text:
+         parts = text.split("Response:")
+         if len(parts) > 1:
+             return parts[1].strip()
+     return text.strip()
+
+ def clean_gender_response(text: str) -> str:
+     if "Response:" in text:
+         parts = text.split("Response:")
+         if len(parts) > 1:
+             text = parts[1].strip()
+     text = re.sub(r'^(Corrected sentence:|Correct:|Prompt:)\s*', '', text, flags=re.IGNORECASE)
+     return text.strip().strip('"')
+
+ def correct_gender_rules(text: str) -> str:
+     corrections = {
+         r'\bher wife\b': 'her husband', r'\bhis husband\b': 'his wife',
+         r'\bhe is a girl\b': 'he is a boy', r'\bshe is a boy\b': 'she is a girl'
+     }
+     for pattern, replacement in corrections.items():
+         text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
+     return text
+
+ # --- API Endpoints ---
+
+ @app.post("/correct_grammar", response_model=CorrectionResponse)
+ async def handle_grammar_correction(request: CorrectionRequest):
+     if not grammar_model or not grammar_tokenizer:
+         raise HTTPException(status_code=503, detail="Grammar model is not available.")
+     prompt_text = request.text
+     input_text = f"Prompt: {prompt_text}\nResponse:"
+     inputs = grammar_tokenizer(input_text, return_tensors="pt").to(device)
+     output_ids = grammar_model.generate(**inputs, max_new_tokens=64, do_sample=False)
+     output_text = grammar_tokenizer.decode(output_ids[0], skip_special_tokens=True)
+     corrected = clean_grammar_response(output_text)
+     return CorrectionResponse(original_text=prompt_text, corrected_text=corrected)
+
+ @app.post("/correct_gender", response_model=CorrectionResponse)
+ async def handle_gender_correction(request: CorrectionRequest):
+     if not gender_model or not gender_tokenizer:
+         raise HTTPException(status_code=503, detail="Gender model is not available.")
+     prompt_text = request.text
+     input_text = f"Prompt: Please rewrite the sentence with correct grammar and gender. Output ONLY the corrected sentence:\n{prompt_text}\nResponse:"
+     inputs = gender_tokenizer(input_text, return_tensors="pt").to(device)
+     # Greedy decoding; temperature is omitted since do_sample=False makes it a no-op.
+     output_ids = gender_model.generate(
+         **inputs, max_new_tokens=64,
+         do_sample=False, eos_token_id=gender_tokenizer.eos_token_id
+     )
+     output_text = gender_tokenizer.decode(output_ids[0], skip_special_tokens=True)
+     cleaned_from_model = clean_gender_response(output_text)
+     final_correction = correct_gender_rules(cleaned_from_model)
+     return CorrectionResponse(original_text=prompt_text, corrected_text=final_correction)
+
+ @app.get("/")
+ def read_root():
+     return {"status": "Text Correction API is running."}
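Both endpoints accept the same {"text": ...} body and return original_text alongside corrected_text, so a quick smoke test can be run from the command line (the sample sentence is made up):

    curl -X POST http://localhost:8000/correct_grammar \
         -H 'Content-Type: application/json' \
         -d '{"text": "She have went to the exam room."}'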
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ fastapi
+ uvicorn[standard]
+ torch
+ transformers
+ peft
+ accelerate
+ pydantic
+ sentencepiece
requirements_local.txt ADDED
@@ -0,0 +1,8 @@
+ fastapi
+ uvicorn[standard]
+ torch
+ transformers
+ peft
+ python-docx
+ fpdf2
+ textract
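Two caveats on this local requirements list: document_pipeline.py also imports requests, which is not pinned here and should not be assumed to arrive transitively; and textract delegates legacy formats to external binaries, so for .doc files it typically shells out to antiword, meaning on Debian/Ubuntu an install along these lines is usually needed first:

    sudo apt-get install antiword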