Spaces:
Sleeping
Sleeping
File size: 2,546 Bytes
8fb878c 713fbcf 8fb878c 100a2d3 12e3ddb 44081bf 100a2d3 12e3ddb 8fb878c 54aa11c 8fb878c 54aa11c 8fb878c 21f5e3b 8fb878c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
import os
import io
import json
from google.cloud import vision
from dotenv import load_dotenv
from groq import Groq
load_dotenv()

# Google client libraries locate credentials through the file named by
# GOOGLE_APPLICATION_CREDENTIALS, so the secret JSON string supplied in the
# GCV_JSON environment variable is written to a temporary file first.
gcv_json_str = os.environ.get("GCV_JSON")
if gcv_json_str:
    temp_path = "/tmp/gcv_temp.json"
    # The file holds a secret: create it readable/writable by the owner only
    # rather than with the default permissions. (The mode argument only takes
    # effect when the file is newly created.)
    fd = os.open(temp_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w", encoding="utf-8") as f:
        f.write(gcv_json_str)
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp_path

# Module-level Groq client used by extract_table_from_text; the API key is
# read from the GROQ_API_KEY environment variable.
client = Groq(api_key=os.getenv("GROQ_API_KEY"))
def run_ocr_with_gcv(image_path):
    """Run Google Cloud Vision document text detection on an image file.

    Args:
        image_path: Path of the image file to send to the Vision API.

    Returns:
        The ``full_text_annotation.text`` string of the Vision response.

    Raises:
        RuntimeError: If the Vision API reports an error for this image.
    """
    client_vision = vision.ImageAnnotatorClient()
    with open(image_path, "rb") as image_file:
        content = image_file.read()
    image = vision.Image(content=content)
    response = client_vision.document_text_detection(image=image)
    # A per-image failure is reported inside the response rather than raised,
    # so without this check an API error would look like an image with no text.
    if response.error.message:
        raise RuntimeError(
            f"Google Cloud Vision OCR failed: {response.error.message}"
        )
    return response.full_text_annotation.text
def extract_table_from_text(
    text,
    max_tokens=4096,
    model="meta-llama/llama-4-scout-17b-16e-instruct",
    temperature=1,
    top_p=1,
):
    """Ask the Groq chat model to turn invoice text into a Markdown table.

    Args:
        text: Raw invoice text (for example the OCR output).
        max_tokens: Upper bound passed as ``max_completion_tokens``.
        model: Identifier of the Groq chat model to use.
        temperature: Sampling temperature forwarded to the API. Defaults to
            the previously hard-coded value of 1.
        top_p: Nucleus-sampling value forwarded to the API. Defaults to the
            previously hard-coded value of 1.

    Returns:
        The message content of the first completion choice, or an empty
        string if the API returned no content.
    """
    # The prompt lines are kept flush-left on purpose: any indentation here
    # would become part of the text sent to the model.
    prompt = f"""
Extract a structured table of items from the invoice text below.
- First find out what the table column names are.
- The table should include all items under column names.
- If some values are missing, fill as "N/A".
Output the table in Markdown format. Only return the table.
Invoice Text:
\"\"\"
{text}
\"\"\"
"""
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a professional invoice data extractor."},
            {"role": "user", "content": prompt},
        ],
        temperature=temperature,
        max_completion_tokens=max_tokens,
        top_p=top_p,
    )
    content = response.choices[0].message.content
    # Normalise a missing content to "" so callers always receive a string.
    return content if content is not None else ""
import pandas as pd
from io import StringIO
def _is_md_separator_row(line):
    """Return True if *line* is a Markdown header separator such as ``|---|:-:|``."""
    stripped = line.strip()
    # A separator row is built only from pipes, dashes, alignment colons and
    # whitespace, and contains at least one dash.
    return "-" in stripped and set(stripped) <= set("|-: \t")


def extract_markdown_table(output_text):
    """Parse the Markdown table contained in *output_text* into a DataFrame.

    Every line with at least two ``|`` characters is treated as a table row;
    other lines (for example surrounding prose) are ignored. The first such
    line is the header. If the second is a Markdown separator row it is
    dropped.

    Args:
        output_text: Text that contains a pipe-delimited Markdown table.

    Returns:
        A DataFrame with whitespace-stripped column names and string cells.
        Columns that are entirely empty (such as those produced by the
        leading and trailing pipes) are removed.

    Raises:
        ValueError: If fewer than two table-like lines are found.
    """
    # Step 1: collect table-like lines. splitlines() also handles "\r\n";
    # split('\n') would leave a "\r" that turns into a bogus extra column.
    lines = output_text.strip().splitlines()
    table_lines = [line for line in lines if line.count('|') > 1]
    if len(table_lines) < 2:
        raise ValueError("❌ No markdown table found in output.")

    # Step 2: remove the header separator if present. Matching on the row's
    # character set also recognises alignment rows such as "|:--|--:|" and
    # avoids discarding a data row that merely contains "---".
    if _is_md_separator_row(table_lines[1]):
        table_lines = [table_lines[0]] + table_lines[2:]

    # Step 3: let pandas parse the rows as pipe-separated values.
    cleaned_md = "\n".join(table_lines)
    df = pd.read_csv(StringIO(cleaned_md), sep='|', engine='python')
    df = df.dropna(axis=1, how='all')  # remove empty columns
    df.columns = [col.strip() for col in df.columns]
    # Column-wise Series.map is used instead of DataFrame.applymap, which is
    # deprecated in recent pandas releases.
    df = df.apply(
        lambda column: column.map(
            lambda x: x.strip() if isinstance(x, str) else x
        )
    )
    return df
|