"""
Document type validation utility.

Helps identify whether uploaded documents are actually patents.
"""
| import re |
| from pathlib import Path |
| from typing import Tuple, List |
| from loguru import logger |
|
|
|
|
class DocumentValidator:
    """Heuristic checks for whether extracted document text looks like a patent.

    Every method is static; the class only groups the keyword lists with the
    checks that consume them. All matching is case-insensitive substring
    matching on the lower-cased text.
    """

    # Terms that count toward the keyword score when present as a substring.
    # 'claim' and 'claims' are separate entries, so text containing 'claims'
    # scores for both.
    PATENT_KEYWORDS = [
        'patent', 'claim', 'claims', 'invention', 'abstract',
        'field of invention', 'background', 'detailed description',
        'inventor', 'assignee', 'filing date', 'application'
    ]

    # Section names reported as missing when absent from the text.
    REQUIRED_SECTIONS = ['abstract', 'claim']

    @staticmethod
    def _run_checks(text: str) -> Tuple[bool, List[str]]:
        """Run all patent heuristics without logging anything.

        Shared by the public methods so that classifying a document does not
        emit the validation warning a second time.

        Args:
            text: Extracted document text; ``None`` is treated as empty text.

        Returns:
            Tuple of (is_valid, issues_found)
        """
        if text is None:
            text = ""
        text_lower = text.lower()
        issues: List[str] = []

        if len(text) < 500:
            issues.append("Document too short (< 500 characters)")

        keyword_matches = sum(
            1 for kw in DocumentValidator.PATENT_KEYWORDS if kw in text_lower
        )
        if keyword_matches < 3:
            issues.append(
                f"Only {keyword_matches} patent keywords found (expected at least 3)"
            )

        missing_sections = [
            section
            for section in DocumentValidator.REQUIRED_SECTIONS
            if section not in text_lower
        ]
        if missing_sections:
            issues.append(f"Missing required sections: {', '.join(missing_sections)}")

        # Only existence matters, so stop at the first "claim <number>" match.
        has_numbered_claim = re.search(r'claim\s+\d+', text_lower) is not None
        if not has_numbered_claim:
            issues.append("No numbered claims found")

        # The length and missing-section findings are advisory: a document
        # with enough keywords and at least one numbered claim is accepted
        # even when those issues are reported.
        is_valid = not issues or (keyword_matches >= 3 and has_numbered_claim)
        return is_valid, issues

    @staticmethod
    def validate_patent_document(text: str) -> Tuple[bool, List[str]]:
        """
        Validate if document text appears to be a patent

        A warning is logged when the document is rejected. The returned issue
        list may be non-empty even for an accepted document, because the
        length and missing-section checks do not reject on their own.

        Args:
            text: Extracted document text; ``None`` is treated as empty text

        Returns:
            Tuple of (is_valid, issues_found)
        """
        is_valid, issues = DocumentValidator._run_checks(text)

        if not is_valid:
            logger.warning(f"Document validation failed: {issues}")

        return is_valid, issues

    @staticmethod
    def identify_document_type(text: str) -> str:
        """
        Try to identify what type of document this is

        The patent heuristics run first, so text accepted by
        ``validate_patent_document`` is always reported as a patent even if it
        also mentions terms such as "conference" or "news". Otherwise the
        first matching substring heuristic wins, in the order written below.

        Args:
            text: Extracted document text; ``None`` is treated as empty text

        Returns:
            Document type description
        """
        if text is None:
            text = ""

        # Uses the non-logging helper: classification is a query and should
        # not emit a validation-failure warning.
        is_patent, _ = DocumentValidator._run_checks(text)
        if is_patent:
            return "Patent document"

        text_lower = text.lower()

        if 'microsoft' in text_lower and 'windows' in text_lower:
            return "Microsoft Windows documentation"

        if any(term in text_lower for term in ['press release', 'news', 'announcement']):
            return "Press release or news article"

        if any(term in text_lower for term in ['whitepaper', 'white paper', 'technical report']):
            return "Technical whitepaper or report"

        if any(term in text_lower for term in ['terms of service', 'privacy policy', 'license agreement']):
            return "Legal agreement or policy document"

        if 'research paper' in text_lower or 'ieee' in text_lower or 'conference' in text_lower:
            return "Academic research paper"

        return "Unknown document type (not a patent)"
|
|
|
|
def validate_and_log(text: str, document_name: str = "document") -> bool:
    """
    Validate a document and report the outcome through the logger.

    On success a single success-level line is logged. On failure the detected
    document type and the list of issues are logged at error level.

    Args:
        text: Document text
        document_name: Label used for the document in log messages

    Returns:
        True if the text validated as a patent, False otherwise
    """
    passed, issues = DocumentValidator.validate_patent_document(text)

    # Happy path first: nothing more to diagnose.
    if passed:
        logger.success(f"✅ {document_name} appears to be a valid patent")
        return True

    # Rejected: work out what the document looks like before reporting.
    doc_type = DocumentValidator.identify_document_type(text)
    logger.error(f"❌ {document_name} is NOT a valid patent")
    logger.error(f" Detected type: {doc_type}")
    logger.error(f" Issues: {', '.join(issues)}")
    return False
|
|