# BirdScopeAI — langgraph_agent/structured_output.py
# Last change: "Fixing agent url hallucinations" (commit 0588003, by facemelter)
"""
Structured output parsing using LlamaIndex Pydantic Programs.
Ensures consistent image formatting in agent responses.
HACKATHON OPTIMIZED: Uses regex extraction instead of LLM calls for speed.
"""
from typing import List, Optional
import re
from pydantic import BaseModel, Field
class BirdIdentificationResponse(BaseModel):
    """Structured response for bird identification using LlamaIndex Pydantic.

    Typed container filled by parse_agent_response() from regex-extracted
    data. Field descriptions double as schema documentation if this model is
    ever handed to an LLM structured-output program.
    """
    # Full agent reply; parse_agent_response() stores the raw text here unchanged.
    summary: str = Field(
        description="Main response text with bird identification, facts, or information"
    )
    # Best-effort extraction via extract_species_name(); None when no pattern matched.
    species_name: Optional[str] = Field(
        default=None,
        description="Common name of the bird species (e.g., 'Northern Cardinal')"
    )
    # Populated by extract_urls_from_text(); rendered as markdown images downstream.
    image_urls: List[str] = Field(
        default_factory=list,
        description="List of image URLs to display for this bird"
    )
    # Direct audio files plus xeno-canto recording links; rendered as markdown links.
    audio_urls: List[str] = Field(
        default_factory=list,
        description="List of audio URLs (bird calls/songs)"
    )
    # Currently always None — see the construction site in parse_agent_response().
    confidence_score: Optional[float] = Field(
        default=None,
        description="Confidence score from classifier (0.0-1.0)"
    )
def extract_urls_from_text(text: str) -> tuple[List[str], List[str]]:
    """
    Extract image and audio URLs from text using regex.
    Handles URLs within markdown, JSON, and plain text.
    Supports both extension-based URLs (.jpg, .png) and domain-based (Unsplash).

    Fix: deduplication now uses dict.fromkeys so the returned lists keep
    first-seen order and are stable across runs; list(set(...)) reordered
    them nondeterministically under string hash randomization.

    Returns:
        tuple: (image_urls, audio_urls)
    """
    # Pattern 1: image URLs with a known file extension. Non-greedy up to the
    # extension, stopping at whitespace or common delimiters ) ] }, with an
    # optional query string.
    image_pattern_ext = r'https?://[^\s)}\]]+?\.(?:jpg|jpeg|png|gif|webp|svg)(?:\?[^\s)}\]]*)?'
    # Pattern 2: Unsplash photo URLs carry no file extension, so match by domain.
    image_pattern_unsplash = r'https?://images\.unsplash\.com/[^\s)}\]]*'
    # Audio: direct files by extension, plus xeno-canto recording pages
    # (optionally their /download endpoint).
    audio_pattern_files = r'https?://[^\s)}\]]+?\.(?:mp3|wav|ogg|m4a)(?:\?[^\s)}\]]*)?'
    audio_pattern_xenocanto = r'https?://xeno-canto\.org/\d+(?:/download)?'
    print(f"[EXTRACT_URLS] Searching text of length {len(text)}")
    # Raw matches; cleaned + deduplicated below.
    raw_image_urls_ext = re.findall(image_pattern_ext, text, re.IGNORECASE)
    raw_image_urls_unsplash = re.findall(image_pattern_unsplash, text, re.IGNORECASE)
    raw_audio_urls_files = re.findall(audio_pattern_files, text, re.IGNORECASE)
    # Order-preserving dedupe (was list(set(...)) — unstable ordering).
    audio_urls_xenocanto = list(dict.fromkeys(re.findall(audio_pattern_xenocanto, text, re.IGNORECASE)))
    # Combine image URLs from both patterns
    raw_image_urls = raw_image_urls_ext + raw_image_urls_unsplash
    print(f"[EXTRACT_URLS] Found {len(raw_image_urls_ext)} extension-based image URLs")
    print(f"[EXTRACT_URLS] Found {len(raw_image_urls_unsplash)} Unsplash image URLs")
    print(f"[EXTRACT_URLS] Found {len(raw_audio_urls_files)} audio file URLs")
    print(f"[EXTRACT_URLS] Found {len(audio_urls_xenocanto)} xeno-canto URLs")

    def clean_url(url: str) -> Optional[str]:
        """Strip trailing quotes/commas/parens; None if no longer a valid URL."""
        cleaned = url.rstrip('",;)')
        if cleaned.startswith(('http://', 'https://')):
            return cleaned
        print(f"[EXTRACT_URLS] ⚠️ Rejected malformed URL after cleaning: {cleaned}")
        return None

    # Clean, drop rejects, dedupe while preserving first-seen order.
    image_urls = list(dict.fromkeys(
        u for u in (clean_url(url) for url in raw_image_urls) if u is not None
    ))
    audio_urls_files = list(dict.fromkeys(
        u for u in (clean_url(url) for url in raw_audio_urls_files) if u is not None
    ))
    # Direct audio files first, then xeno-canto pages (original ordering).
    audio_urls = audio_urls_files + audio_urls_xenocanto
    # Log the actual URLs extracted
    print(f"[EXTRACT_URLS] βœ… Cleaned image URLs ({len(image_urls)}): {image_urls}")
    print(f"[EXTRACT_URLS] βœ… Cleaned audio URLs ({len(audio_urls)}): {audio_urls}")
    return image_urls, audio_urls
def extract_species_name(text: str) -> Optional[str]:
    """
    Best-effort extraction of a bird species name from response text.

    Scans for a handful of common phrasings ("identified as X", "species: X",
    "This is a X") and returns the first capitalized 1-4 word name found,
    or None when no pattern matches.
    """
    candidate_patterns = (
        r'identified as[:\s]+([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,3})',
        r'species[:\s]+([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,3})',
        r'This is (?:a |an )?([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,3})',
    )
    for candidate in candidate_patterns:
        hit = re.search(candidate, text)
        if hit is not None:
            return hit.group(1)
    return None
async def parse_agent_response(
raw_response: str,
provider: str,
api_key: str,
model: str
) -> str:
"""
Parse agent response into structured format and reformat with guaranteed markdown.
OPTIMIZED FOR HACKATHON: Uses regex extraction instead of LLM call.
Still uses LlamaIndex Pydantic models for structured data.
Args:
raw_response: The agent's raw text response
provider: LLM provider ("openai", "anthropic", "huggingface")
api_key: API key (unused in optimized version)
model: Model name (unused in optimized version)
Returns:
Formatted markdown response with guaranteed image syntax
"""
try:
print("[STRUCTURED OUTPUT] Starting parsing...")
print(f"[STRUCTURED OUTPUT] Raw response length: {len(raw_response)} characters")
print(f"[STRUCTURED OUTPUT] First 500 chars: {raw_response[:500]}")
print(f"[STRUCTURED OUTPUT] Last 500 chars: {raw_response[-500:]}")
# Extract URLs using regex (fast, no API call)
image_urls, audio_urls = extract_urls_from_text(raw_response)
print(f"[STRUCTURED OUTPUT] Found {len(image_urls)} images, {len(audio_urls)} audio files")
# Extract species name if possible
species_name = extract_species_name(raw_response)
# Create structured response using LlamaIndex Pydantic model
structured = BirdIdentificationResponse(
summary=raw_response, # Keep full response as summary
species_name=species_name,
image_urls=image_urls,
audio_urls=audio_urls,
confidence_score=None # Could extract with regex if needed
)
# Check if we found any media to format
if not structured.image_urls and not structured.audio_urls:
print("[STRUCTURED OUTPUT] No images or audio found, returning original")
return raw_response
# Reformat into markdown with guaranteed images
formatted_parts = []
# Main summary (but remove already-formatted images/audio to avoid duplication)
clean_summary = raw_response
for url in image_urls:
# Remove existing markdown images
clean_summary = re.sub(rf'!\[([^\]]*)\]\({re.escape(url)}\)', '', clean_summary)
# Remove plain URLs
clean_summary = clean_summary.replace(url, '')
for url in audio_urls:
# Remove audio URLs from summary
clean_summary = clean_summary.replace(url, '')
formatted_parts.append(clean_summary.strip())
# Add images with markdown syntax
if structured.image_urls:
formatted_parts.append("\n### Images\n")
for idx, url in enumerate(structured.image_urls, 1):
# Use species name if available, otherwise generic
alt_text = structured.species_name or f"Bird {idx}"
img_markdown = f"![{alt_text}]({url})"
print(f"[STRUCTURED OUTPUT] Generated image markdown: {img_markdown}")
formatted_parts.append(img_markdown)
# Add audio links if present
if structured.audio_urls:
formatted_parts.append("\n### Audio Recordings\n")
for idx, url in enumerate(structured.audio_urls, 1):
# Strip /download from xeno-canto URLs for browser-friendly links
display_url = url.replace("/download", "") if "xeno-canto.org" in url else url
formatted_parts.append(f"πŸ”Š [Listen to recording {idx}]({display_url})")
result = "\n\n".join(formatted_parts)
print(f"[STRUCTURED OUTPUT] βœ… Successfully formatted response")
print(f"[STRUCTURED OUTPUT] Final markdown length: {len(result)} characters")
print(f"[STRUCTURED OUTPUT] Final markdown (last 500 chars): {result[-500:]}")
return result
except Exception as e:
# Fallback: return original response if parsing fails
print(f"[STRUCTURED OUTPUT] ❌ Parsing failed: {e}")
return raw_response