# BirdScopeAI — langgraph_agent/structured_output.py
# Last change: "Fixing agent url hallucinations" (commit 0588003, by facemelter)
"""
Structured output parsing using LlamaIndex Pydantic Programs.
Ensures consistent image formatting in agent responses.
HACKATHON OPTIMIZED: Uses regex extraction instead of LLM calls for speed.
"""
from typing import List, Optional
import re
from pydantic import BaseModel, Field
class BirdIdentificationResponse(BaseModel):
    """Structured response for bird identification using LlamaIndex Pydantic.

    Typed container filled by parse_agent_response() from regex-extracted
    data. Field descriptions double as schema documentation if this model is
    ever handed to an LLM structured-output program.
    """
    # Full agent reply; parse_agent_response() stores the raw text here unchanged.
    summary: str = Field(
        description="Main response text with bird identification, facts, or information"
    )
    # Best-effort extraction via extract_species_name(); None when no pattern matched.
    species_name: Optional[str] = Field(
        default=None,
        description="Common name of the bird species (e.g., 'Northern Cardinal')"
    )
    # Populated by extract_urls_from_text(); rendered as markdown images downstream.
    image_urls: List[str] = Field(
        default_factory=list,
        description="List of image URLs to display for this bird"
    )
    # Direct audio files plus xeno-canto recording links; rendered as markdown links.
    audio_urls: List[str] = Field(
        default_factory=list,
        description="List of audio URLs (bird calls/songs)"
    )
    # Currently always None — see the construction site in parse_agent_response().
    confidence_score: Optional[float] = Field(
        default=None,
        description="Confidence score from classifier (0.0-1.0)"
    )
def extract_urls_from_text(text: str) -> tuple[List[str], List[str]]:
    """
    Extract image and audio URLs from text using regex.
    Handles URLs within markdown, JSON, and plain text.
    Supports both extension-based URLs (.jpg, .png) and domain-based (Unsplash).

    Fix: deduplication now uses dict.fromkeys so the returned lists keep
    first-seen order and are stable across runs; list(set(...)) reordered
    them nondeterministically under string hash randomization.

    Returns:
        tuple: (image_urls, audio_urls)
    """
    # Pattern 1: image URLs with a known file extension. Non-greedy up to the
    # extension, stopping at whitespace or common delimiters ) ] }, with an
    # optional query string.
    image_pattern_ext = r'https?://[^\s)}\]]+?\.(?:jpg|jpeg|png|gif|webp|svg)(?:\?[^\s)}\]]*)?'
    # Pattern 2: Unsplash photo URLs carry no file extension, so match by domain.
    image_pattern_unsplash = r'https?://images\.unsplash\.com/[^\s)}\]]*'
    # Audio: direct files by extension, plus xeno-canto recording pages
    # (optionally their /download endpoint).
    audio_pattern_files = r'https?://[^\s)}\]]+?\.(?:mp3|wav|ogg|m4a)(?:\?[^\s)}\]]*)?'
    audio_pattern_xenocanto = r'https?://xeno-canto\.org/\d+(?:/download)?'
    print(f"[EXTRACT_URLS] Searching text of length {len(text)}")
    # Raw matches; cleaned + deduplicated below.
    raw_image_urls_ext = re.findall(image_pattern_ext, text, re.IGNORECASE)
    raw_image_urls_unsplash = re.findall(image_pattern_unsplash, text, re.IGNORECASE)
    raw_audio_urls_files = re.findall(audio_pattern_files, text, re.IGNORECASE)
    # Order-preserving dedupe (was list(set(...)) — unstable ordering).
    audio_urls_xenocanto = list(dict.fromkeys(re.findall(audio_pattern_xenocanto, text, re.IGNORECASE)))
    # Combine image URLs from both patterns
    raw_image_urls = raw_image_urls_ext + raw_image_urls_unsplash
    print(f"[EXTRACT_URLS] Found {len(raw_image_urls_ext)} extension-based image URLs")
    print(f"[EXTRACT_URLS] Found {len(raw_image_urls_unsplash)} Unsplash image URLs")
    print(f"[EXTRACT_URLS] Found {len(raw_audio_urls_files)} audio file URLs")
    print(f"[EXTRACT_URLS] Found {len(audio_urls_xenocanto)} xeno-canto URLs")

    def clean_url(url: str) -> Optional[str]:
        """Strip trailing quotes/commas/parens; None if no longer a valid URL."""
        cleaned = url.rstrip('",;)')
        if cleaned.startswith(('http://', 'https://')):
            return cleaned
        print(f"[EXTRACT_URLS] ⚠️ Rejected malformed URL after cleaning: {cleaned}")
        return None

    # Clean, drop rejects, dedupe while preserving first-seen order.
    image_urls = list(dict.fromkeys(
        u for u in (clean_url(url) for url in raw_image_urls) if u is not None
    ))
    audio_urls_files = list(dict.fromkeys(
        u for u in (clean_url(url) for url in raw_audio_urls_files) if u is not None
    ))
    # Direct audio files first, then xeno-canto pages (original ordering).
    audio_urls = audio_urls_files + audio_urls_xenocanto
    # Log the actual URLs extracted
    print(f"[EXTRACT_URLS] βœ… Cleaned image URLs ({len(image_urls)}): {image_urls}")
    print(f"[EXTRACT_URLS] βœ… Cleaned audio URLs ({len(audio_urls)}): {audio_urls}")
    return image_urls, audio_urls
def extract_species_name(text: str) -> Optional[str]:
    """
    Best-effort extraction of a bird species name from response text.

    Scans for a handful of common phrasings ("identified as X", "species: X",
    "This is a X") and returns the first capitalized 1-4 word name found,
    or None when no pattern matches.
    """
    candidate_patterns = (
        r'identified as[:\s]+([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,3})',
        r'species[:\s]+([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,3})',
        r'This is (?:a |an )?([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,3})',
    )
    for candidate in candidate_patterns:
        hit = re.search(candidate, text)
        if hit is not None:
            return hit.group(1)
    return None
async def parse_agent_response(
raw_response: str,
provider: str,
api_key: str,
model: str
) -> str:
"""
Parse agent response into structured format and reformat with guaranteed markdown.
OPTIMIZED FOR HACKATHON: Uses regex extraction instead of LLM call.
Still uses LlamaIndex Pydantic models for structured data.
Args:
raw_response: The agent's raw text response
provider: LLM provider ("openai", "anthropic", "huggingface")
api_key: API key (unused in optimized version)
model: Model name (unused in optimized version)
Returns:
Formatted markdown response with guaranteed image syntax
"""
try:
print("[STRUCTURED OUTPUT] Starting parsing...")
print(f"[STRUCTURED OUTPUT] Raw response length: {len(raw_response)} characters")
print(f"[STRUCTURED OUTPUT] First 500 chars: {raw_response[:500]}")
print(f"[STRUCTURED OUTPUT] Last 500 chars: {raw_response[-500:]}")
# Extract URLs using regex (fast, no API call)
image_urls, audio_urls = extract_urls_from_text(raw_response)
print(f"[STRUCTURED OUTPUT] Found {len(image_urls)} images, {len(audio_urls)} audio files")
# Extract species name if possible
species_name = extract_species_name(raw_response)
# Create structured response using LlamaIndex Pydantic model
structured = BirdIdentificationResponse(
summary=raw_response, # Keep full response as summary
species_name=species_name,
image_urls=image_urls,
audio_urls=audio_urls,
confidence_score=None # Could extract with regex if needed
)
# Check if we found any media to format
if not structured.image_urls and not structured.audio_urls:
print("[STRUCTURED OUTPUT] No images or audio found, returning original")
return raw_response
# Reformat into markdown with guaranteed images
formatted_parts = []
# Main summary (but remove already-formatted images/audio to avoid duplication)
clean_summary = raw_response
for url in image_urls:
# Remove existing markdown images
clean_summary = re.sub(rf'!\[([^\]]*)\]\({re.escape(url)}\)', '', clean_summary)
# Remove plain URLs
clean_summary = clean_summary.replace(url, '')
for url in audio_urls:
# Remove audio URLs from summary
clean_summary = clean_summary.replace(url, '')
formatted_parts.append(clean_summary.strip())
# Add images with markdown syntax
if structured.image_urls:
formatted_parts.append("\n### Images\n")
for idx, url in enumerate(structured.image_urls, 1):
# Use species name if available, otherwise generic
alt_text = structured.species_name or f"Bird {idx}"
img_markdown = f"![{alt_text}]({url})"
print(f"[STRUCTURED OUTPUT] Generated image markdown: {img_markdown}")
formatted_parts.append(img_markdown)
# Add audio links if present
if structured.audio_urls:
formatted_parts.append("\n### Audio Recordings\n")
for idx, url in enumerate(structured.audio_urls, 1):
# Strip /download from xeno-canto URLs for browser-friendly links
display_url = url.replace("/download", "") if "xeno-canto.org" in url else url
formatted_parts.append(f"πŸ”Š [Listen to recording {idx}]({display_url})")
result = "\n\n".join(formatted_parts)
print(f"[STRUCTURED OUTPUT] βœ… Successfully formatted response")
print(f"[STRUCTURED OUTPUT] Final markdown length: {len(result)} characters")
print(f"[STRUCTURED OUTPUT] Final markdown (last 500 chars): {result[-500:]}")
return result
except Exception as e:
# Fallback: return original response if parsing fails
print(f"[STRUCTURED OUTPUT] ❌ Parsing failed: {e}")
return raw_response