| """ |
| Debug script to test document analysis extraction |
| """ |
| import asyncio |
| import sys |
| from pathlib import Path |
|
|
| |
| sys.path.insert(0, str(Path(__file__).parent)) |
|
|
| from src.llm.langchain_ollama_client import get_langchain_client |
| from src.agents.scenario1.document_analysis_agent import DocumentAnalysisAgent |
| from loguru import logger |
|
|
async def main():
    """Exercise the document-analysis extraction steps on one patent PDF.

    Picks the first PDF (in sorted path order) found in ``uploads/patents``,
    extracts its text through the agent, then runs the agent's
    ``structure_chain`` on the leading 8000 characters of that text and logs
    a short summary of the structure that comes back.

    Everything is reported through the loguru ``logger``. The coroutine
    returns ``None`` on every path: no PDFs found, chain failure, unexpected
    result type, or success.
    """
    patent_path = "uploads/patents"
    llm_input_chars = 8000      # how much of the text is handed to the chain
    text_preview_chars = 500    # how much extracted text is echoed to the log
    abstract_preview_chars = 200

    # Directory listing order is filesystem-dependent; sorting makes the
    # script test the same file on every run.
    patent_files = sorted(Path(patent_path).glob("*.pdf"))
    if not patent_files:
        logger.error(f"No patent PDFs found in {patent_path}")
        return

    test_patent = str(patent_files[0])
    logger.info(f"Testing with patent: {test_patent}")

    llm_client = get_langchain_client(default_complexity='standard')
    agent = DocumentAnalysisAgent(llm_client)

    # Step 1: raw text extraction.
    logger.info("Step 1: Extracting text...")
    patent_text = await agent._extract_patent_text(test_patent)
    logger.info(f"Extracted text length: {len(patent_text)} characters")
    logger.info(f"First 500 chars: {patent_text[:text_preview_chars]}")

    # Step 2: structured extraction. The parser is only used here to produce
    # the format instructions that are passed to the chain.
    logger.info("\nStep 2: Extracting structure...")
    from langchain_core.output_parsers import JsonOutputParser
    parser = JsonOutputParser()

    # Keep the try body to the chain call alone so that a problem while
    # summarising the result is not misreported as an extraction failure.
    try:
        structure = await agent.structure_chain.ainvoke({
            "patent_text": patent_text[:llm_input_chars],
            "format_instructions": parser.get_format_instructions()
        })
    except Exception as e:
        # Top-level boundary of a debug script: log the error together with
        # its traceback instead of letting it propagate.
        logger.exception(f"Structure extraction failed: {e}")
        return

    # The summary below relies on dict access; say so clearly if the chain
    # produced something else.
    if not isinstance(structure, dict):
        logger.error(
            f"Structure extraction returned {type(structure).__name__}, "
            f"expected a JSON object: {structure!r}"
        )
        return

    abstract = structure.get('abstract')
    abstract_preview = (
        str(abstract)[:abstract_preview_chars] if abstract else 'NOT FOUND'
    )
    # `or []` also covers a key that is present but null, which a plain
    # `.get(key, [])` default would pass through to len() as None.
    independent_claims = structure.get('independent_claims') or []
    dependent_claims = structure.get('dependent_claims') or []

    logger.info("\nExtracted structure:")
    logger.info(f" Title: {structure.get('title', 'NOT FOUND')}")
    logger.info(f" Abstract: {abstract_preview}")
    logger.info(f" Patent ID: {structure.get('patent_id', 'NOT FOUND')}")
    logger.info(f" Independent claims: {len(independent_claims)}")
    logger.info(f" Dependent claims: {len(dependent_claims)}")
    logger.info(f"\nFull structure keys: {structure.keys()}")
|
|
# Script entry point: run the async debug routine to completion on a fresh
# event loop. Nothing happens when this module is merely imported.
if __name__ == "__main__":
    debug_run = main()
    asyncio.run(debug_run)
|
|