File size: 2,546 Bytes
8fb878c
 
713fbcf
8fb878c
 
 
 
 
 
 
100a2d3
 
 
 
12e3ddb
44081bf
100a2d3
12e3ddb
8fb878c
 
 
 
 
 
 
 
 
 
 
 
54aa11c
8fb878c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54aa11c
8fb878c
 
 
 
 
21f5e3b
8fb878c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import os
import io
import json
from google.cloud import vision
from dotenv import load_dotenv
from groq import Groq

# Load variables from a .env file (via python-dotenv) before reading os.environ.
load_dotenv()

# Google's client libraries look up credentials through the file path stored
# in GOOGLE_APPLICATION_CREDENTIALS, so the secret JSON string held in the
# GCV_JSON environment variable is written to disk and that path is exported.
gcv_json_str = os.environ.get("GCV_JSON")
if gcv_json_str:
    temp_path = "/tmp/gcv_temp.json"
    # Create the file owner-readable only (0600) rather than with the umask
    # default: it contains a secret and /tmp is a shared directory.
    # NOTE: the mode is applied only when this call creates the file; a file
    # left over from an earlier run keeps whatever permissions it already had.
    fd = os.open(temp_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    # Explicit UTF-8 so the JSON is not written in a locale-dependent encoding.
    with os.fdopen(fd, "w", encoding="utf-8") as f:
        f.write(gcv_json_str)
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp_path

# Module-level Groq client used by extract_table_from_text; the API key is
# read from the GROQ_API_KEY environment variable.
client = Groq(api_key=os.getenv("GROQ_API_KEY"))

def run_ocr_with_gcv(image_path):
    """Run Google Cloud Vision document text detection on a local image.

    Args:
        image_path: Path of the image file; it is read in binary mode and its
            bytes are sent to the Vision API.

    Returns:
        The ``full_text_annotation.text`` value of the API response.

    Raises:
        RuntimeError: If the response reports an error message. Without this
            check a failed request would be returned as empty text.
    """
    client_vision = vision.ImageAnnotatorClient()
    with open(image_path, "rb") as image_file:
        content = image_file.read()

    image = vision.Image(content=content)
    response = client_vision.document_text_detection(image=image)

    # The API can report per-request failures inside the response object
    # instead of raising, so surface them explicitly.
    if response.error.message:
        raise RuntimeError(
            f"Google Cloud Vision OCR failed for {image_path!r}: "
            f"{response.error.message}"
        )
    return response.full_text_annotation.text

def extract_table_from_text(
    text,
    max_tokens=4096,
    model="meta-llama/llama-4-scout-17b-16e-instruct",
    temperature=1,
):
    """Ask the Groq chat model to turn invoice text into a Markdown table.

    Args:
        text: Invoice text to embed in the prompt (e.g. OCR output).
        max_tokens: Passed to the API as ``max_completion_tokens``.
        model: Identifier of the chat model to call.
        temperature: Sampling temperature passed to the API. Defaults to 1,
            the value this function previously hard-coded; callers that want
            more repeatable extraction can pass a lower value.

    Returns:
        The ``message.content`` of the first choice in the API response.
    """
    # NOTE: the prompt text is sent to the model verbatim, including its
    # leading indentation; edit it with care.
    prompt = f"""
    Extract a structured table of items from the invoice text below. 
    - First findout what are the table column names
    - The table should include all items under column names.
    -
    If some values are missing, fill as "N/A".

    Output the table in Markdown format. Only return the table.

    Invoice Text:
    \"\"\"
    {text}
    \"\"\"
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a professional invoice data extractor."},
            {"role": "user", "content": prompt},
        ],
        temperature=temperature,
        max_completion_tokens=max_tokens,
        top_p=1,
    )
    return response.choices[0].message.content


import pandas as pd
from io import StringIO


def _is_md_separator_row(line):
    """Return True if *line* is a Markdown header delimiter row.

    A delimiter row has only cells made of hyphens with optional colons for
    alignment, e.g. ``|---|---|``, ``| :-: | --: |`` or ``|--|--|``.
    """
    cells = [cell.strip() for cell in line.strip().strip('|').split('|')]
    return all(
        cell and '-' in cell and set(cell) <= {'-', ':'}
        for cell in cells
    )


def extract_markdown_table(output_text):
    """Parse the Markdown table contained in *output_text* into a DataFrame.

    Every line containing at least two ``|`` characters is treated as a table
    row; all other lines (e.g. surrounding commentary) are ignored. The first
    such line is the header. If the second one is a Markdown delimiter row it
    is discarded.

    Args:
        output_text: Text that contains a pipe-delimited Markdown table.

    Returns:
        A ``pandas.DataFrame`` with whitespace-stripped column names and
        string cells. Columns that are entirely empty (such as those produced
        by the leading and trailing ``|`` of each row) are removed.

    Raises:
        ValueError: If fewer than two table-like lines are found.
    """
    # Step 1: keep only the lines that look like table rows.
    lines = output_text.strip().split('\n')
    table_lines = [line for line in lines if line.count('|') > 1]

    if len(table_lines) < 2:
        raise ValueError("❌ No markdown table found in output.")

    # Step 2: drop the header delimiter row if present. Matching on the row's
    # structure (rather than on the substring '---') also recognises short or
    # colon-aligned delimiters and does not discard a data row that merely
    # contains '---' in one of its cells.
    if _is_md_separator_row(table_lines[1]):
        del table_lines[1]

    # Step 3: let pandas parse the remaining rows as '|'-separated values.
    cleaned_md = "\n".join(table_lines)
    df = pd.read_csv(StringIO(cleaned_md), sep='|', engine='python')
    df = df.dropna(axis=1, how='all')  # remove empty columns
    df.columns = [col.strip() for col in df.columns]

    # DataFrame.applymap is deprecated in favour of DataFrame.map (pandas 2.1+);
    # check the class so a column named "map" cannot shadow the method.
    strip_cells = df.map if hasattr(pd.DataFrame, "map") else df.applymap
    df = strip_cells(lambda x: x.strip() if isinstance(x, str) else x)

    return df