Spaces:

RameshJ
/

Invoices_to_Table

Sleeping

File size: 2,546 Bytes

8fb878c
 
713fbcf
8fb878c
 
 
 
 
 
 
100a2d3
 
 
 
12e3ddb
44081bf
100a2d3
12e3ddb
8fb878c
 
 
 
 
 
 
 
 
 
 
 
54aa11c
8fb878c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54aa11c
8fb878c
 
 
 
 
21f5e3b
8fb878c

import os
import io
import json
from google.cloud import vision
from dotenv import load_dotenv
from groq import Groq

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Google Cloud credentials are supplied as a JSON string in the GCV_JSON
# environment variable. The Google client libraries expect a *file path* in
# GOOGLE_APPLICATION_CREDENTIALS, so the secret is written to disk first.
gcv_json_str = os.environ.get("GCV_JSON")
if gcv_json_str:
    temp_path = "/tmp/gcv_temp.json"
    # Create the file with owner-only permissions: /tmp is shared, and the
    # default mode would depend on the process umask.
    _gcv_fd = os.open(temp_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(_gcv_fd, "w", encoding="utf-8") as f:
        # The mode passed to os.open() only applies when the file is newly
        # created; tighten it explicitly in case it already existed.
        os.chmod(temp_path, 0o600)
        f.write(gcv_json_str)
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp_path

# Shared Groq client used by extract_table_from_text().
client = Groq(api_key=os.getenv("GROQ_API_KEY"))

def run_ocr_with_gcv(image_path):
    """Run Google Cloud Vision document OCR on an image file.

    Args:
        image_path: Path of the image file to read and send for OCR.

    Returns:
        The ``full_text_annotation.text`` of the Vision response.

    Raises:
        OSError: If the image file cannot be opened or read.
        RuntimeError: If the Vision response carries an error message.
    """
    client_vision = vision.ImageAnnotatorClient()
    with open(image_path, 'rb') as image_file:
        content = image_file.read()

    image = vision.Image(content=content)
    response = client_vision.document_text_detection(image=image)

    # The API reports per-request failures inside the response object rather
    # than by raising; surface them instead of silently returning empty text.
    if response.error.message:
        raise RuntimeError(
            f"Google Cloud Vision OCR failed: {response.error.message}"
        )
    return response.full_text_annotation.text

def extract_table_from_text(text, max_tokens=4096, model="meta-llama/llama-4-scout-17b-16e-instruct"):
    """Ask the Groq chat model to turn invoice text into a Markdown table.

    The invoice text is embedded in a prompt that instructs the model to
    reply with only a Markdown table, and the request is sent through the
    module-level ``client``.

    Args:
        text: Invoice text to embed in the prompt.
        max_tokens: Passed to the API as ``max_completion_tokens``.
        model: Identifier of the chat model to query.

    Returns:
        The message content of the first choice in the completion.
    """
    prompt = f"""
    Extract a structured table of items from the invoice text below. 
    - First findout what are the table column names
    - The table should include all items under column names.
    -
    If some values are missing, fill as "N/A".

    Output the table in Markdown format. Only return the table.

    Invoice Text:
    \"\"\"
    {text}
    \"\"\"
    """
    # System message sets the role; the user message carries the prompt.
    chat_messages = [
        {"role": "system", "content": "You are a professional invoice data extractor."},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(
        model=model,
        messages=chat_messages,
        temperature=1,
        max_completion_tokens=max_tokens,
        top_p=1,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


import pandas as pd
from io import StringIO


def _is_markdown_separator(line):
    """Return True when *line* is a Markdown header separator row.

    A separator row consists solely of cells made of ``-`` and ``:``
    (for example ``|---|:--|--:|``), each containing at least one dash.
    """
    cells = [cell.strip() for cell in line.strip().strip('|').split('|')]
    return all(
        cell and '-' in cell and set(cell) <= {'-', ':'} for cell in cells
    )


def extract_markdown_table(output_text):
    """Parse the first Markdown-style table found in *output_text*.

    Every line containing at least two ``|`` characters is treated as a
    table row; the first such line is the header. A header separator row
    directly below the header is discarded. Column names and string cells
    are stripped of surrounding whitespace, and columns with no values at
    all (including the empty ones produced by leading/trailing pipes) are
    dropped.

    Args:
        output_text: Text that is expected to contain a Markdown table.
            ``None`` or an empty string is treated as containing no table.

    Returns:
        A ``pandas.DataFrame`` with one row per table data line.

    Raises:
        ValueError: If fewer than two table-like lines are found.
    """
    # Step 1: Collect every line that looks like a table row. Lines are
    # stripped so indentation does not create a bogus leading column.
    lines = (output_text or "").strip().splitlines()
    table_lines = [line.strip() for line in lines if line.count('|') > 1]

    if len(table_lines) < 2:
        raise ValueError("❌ No markdown table found in output.")

    # Step 2: Remove the markdown header separator if present. Matching on
    # the row's shape (not just the substring '---') also catches alignment
    # rows such as '|:--|--:|' and avoids dropping a data row that merely
    # contains dashes in one cell.
    if _is_markdown_separator(table_lines[1]):
        table_lines = [table_lines[0]] + table_lines[2:]

    # Step 3: Parse the pipe-delimited text. Only truly empty cells count
    # as missing, so literal placeholders like "N/A" survive as text
    # instead of becoming NaN (and possibly getting their column dropped).
    cleaned_md = "\n".join(table_lines)
    df = pd.read_csv(
        StringIO(cleaned_md),
        sep='|',
        engine='python',
        keep_default_na=False,
        na_values=[''],
    )
    df = df.dropna(axis=1, how='all')  # remove empty columns
    df.columns = [col.strip() for col in df.columns]

    # DataFrame.applymap is deprecated in favour of DataFrame.map; fall
    # back to applymap on pandas versions that predate DataFrame.map.
    strip_cells = df.map if hasattr(df, 'map') else df.applymap
    df = strip_cells(lambda x: x.strip() if isinstance(x, str) else x)

    return df