Commit
·
92a45c5
1
Parent(s):
9e0e49b
Update data/validation_schemas.py
Browse files- data/validation_schemas.py +32 -25
data/validation_schemas.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
-
# File: data/validation_schemas.py
|
| 2 |
# Comprehensive Pydantic validation schemas for data quality assurance
|
| 3 |
|
| 4 |
-
from pydantic import BaseModel, Field,
|
| 5 |
from typing import List, Optional, Dict, Any, Union
|
| 6 |
from datetime import datetime
|
| 7 |
import re
|
|
@@ -53,7 +53,8 @@ class TextContentSchema(BaseModel):
|
|
| 53 |
description="The news article text content"
|
| 54 |
)
|
| 55 |
|
| 56 |
-
@
|
|
|
|
| 57 |
def validate_text_content(cls, v):
|
| 58 |
"""Comprehensive text content validation"""
|
| 59 |
if not v or not isinstance(v, str):
|
|
@@ -155,7 +156,8 @@ class LabelSchema(BaseModel):
|
|
| 155 |
description="Source reliability score (0-1)"
|
| 156 |
)
|
| 157 |
|
| 158 |
-
@
|
|
|
|
| 159 |
def validate_label(cls, v):
|
| 160 |
"""Validate label value"""
|
| 161 |
if v not in [0, 1]:
|
|
@@ -188,7 +190,8 @@ class DataSourceSchema(BaseModel):
|
|
| 188 |
description="Batch identifier for grouped data"
|
| 189 |
)
|
| 190 |
|
| 191 |
-
@
|
|
|
|
| 192 |
def validate_url(cls, v):
|
| 193 |
"""Validate URL format"""
|
| 194 |
if v is not None:
|
|
@@ -244,7 +247,8 @@ class NewsArticleSchema(BaseModel):
|
|
| 244 |
description="Overall quality score (0-1)"
|
| 245 |
)
|
| 246 |
|
| 247 |
-
@
|
|
|
|
| 248 |
def validate_title(cls, v):
|
| 249 |
"""Validate article title"""
|
| 250 |
if v is not None:
|
|
@@ -259,7 +263,8 @@ class NewsArticleSchema(BaseModel):
|
|
| 259 |
|
| 260 |
return v
|
| 261 |
|
| 262 |
-
@
|
|
|
|
| 263 |
def validate_language(cls, v):
|
| 264 |
"""Validate language code"""
|
| 265 |
valid_languages = ['en', 'es', 'fr', 'de', 'it', 'pt', 'nl', 'ru', 'zh', 'ja', 'ko']
|
|
@@ -267,11 +272,11 @@ class NewsArticleSchema(BaseModel):
|
|
| 267 |
raise ValueError(f"Unsupported language code: {v}")
|
| 268 |
return v
|
| 269 |
|
| 270 |
-
@
|
| 271 |
-
def validate_article_consistency(
|
| 272 |
"""Cross-field validation"""
|
| 273 |
-
text_content =
|
| 274 |
-
title =
|
| 275 |
|
| 276 |
if text_content and title:
|
| 277 |
# Check if title and content are suspiciously similar
|
|
@@ -282,7 +287,7 @@ class NewsArticleSchema(BaseModel):
|
|
| 282 |
# This is fine, just noting high similarity
|
| 283 |
pass
|
| 284 |
|
| 285 |
-
return
|
| 286 |
|
| 287 |
@property
|
| 288 |
def text_quality_level(self) -> TextQualityLevel:
|
|
@@ -338,8 +343,8 @@ class BatchValidationSchema(BaseModel):
|
|
| 338 |
|
| 339 |
articles: List[NewsArticleSchema] = Field(
|
| 340 |
...,
|
| 341 |
-
|
| 342 |
-
|
| 343 |
description="List of articles to validate"
|
| 344 |
)
|
| 345 |
|
|
@@ -360,7 +365,8 @@ class BatchValidationSchema(BaseModel):
|
|
| 360 |
description="Minimum quality score threshold"
|
| 361 |
)
|
| 362 |
|
| 363 |
-
@
|
|
|
|
| 364 |
def validate_article_list(cls, v):
|
| 365 |
"""Validate article list"""
|
| 366 |
if not v:
|
|
@@ -509,26 +515,27 @@ class BatchValidationResultSchema(BaseModel):
|
|
| 509 |
description="Validation summary statistics"
|
| 510 |
)
|
| 511 |
|
| 512 |
-
@
|
| 513 |
-
|
|
|
|
| 514 |
"""Validate article count consistency"""
|
| 515 |
-
if 'total_articles' in
|
| 516 |
-
total =
|
| 517 |
if v > total:
|
| 518 |
raise ValueError("Article count cannot exceed total")
|
| 519 |
return v
|
| 520 |
|
| 521 |
-
@
|
| 522 |
-
def validate_counts_consistency(
|
| 523 |
"""Validate count consistency"""
|
| 524 |
-
total =
|
| 525 |
-
valid =
|
| 526 |
-
invalid =
|
| 527 |
|
| 528 |
if valid + invalid != total:
|
| 529 |
raise ValueError("Valid + invalid articles must equal total articles")
|
| 530 |
|
| 531 |
-
return
|
| 532 |
|
| 533 |
@property
|
| 534 |
def success_rate(self) -> float:
|
|
|
|
| 1 |
+
# File: data/validation_schemas.py
|
| 2 |
# Comprehensive Pydantic validation schemas for data quality assurance
|
| 3 |
|
| 4 |
+
from pydantic import BaseModel, Field, field_validator, model_validator
|
| 5 |
from typing import List, Optional, Dict, Any, Union
|
| 6 |
from datetime import datetime
|
| 7 |
import re
|
|
|
|
| 53 |
description="The news article text content"
|
| 54 |
)
|
| 55 |
|
| 56 |
+
@field_validator('text')
|
| 57 |
+
@classmethod
|
| 58 |
def validate_text_content(cls, v):
|
| 59 |
"""Comprehensive text content validation"""
|
| 60 |
if not v or not isinstance(v, str):
|
|
|
|
| 156 |
description="Source reliability score (0-1)"
|
| 157 |
)
|
| 158 |
|
| 159 |
+
@field_validator('label')
|
| 160 |
+
@classmethod
|
| 161 |
def validate_label(cls, v):
|
| 162 |
"""Validate label value"""
|
| 163 |
if v not in [0, 1]:
|
|
|
|
| 190 |
description="Batch identifier for grouped data"
|
| 191 |
)
|
| 192 |
|
| 193 |
+
@field_validator('url')
|
| 194 |
+
@classmethod
|
| 195 |
def validate_url(cls, v):
|
| 196 |
"""Validate URL format"""
|
| 197 |
if v is not None:
|
|
|
|
| 247 |
description="Overall quality score (0-1)"
|
| 248 |
)
|
| 249 |
|
| 250 |
+
@field_validator('title')
|
| 251 |
+
@classmethod
|
| 252 |
def validate_title(cls, v):
|
| 253 |
"""Validate article title"""
|
| 254 |
if v is not None:
|
|
|
|
| 263 |
|
| 264 |
return v
|
| 265 |
|
| 266 |
+
@field_validator('language')
|
| 267 |
+
@classmethod
|
| 268 |
def validate_language(cls, v):
|
| 269 |
"""Validate language code"""
|
| 270 |
valid_languages = ['en', 'es', 'fr', 'de', 'it', 'pt', 'nl', 'ru', 'zh', 'ja', 'ko']
|
|
|
|
| 272 |
raise ValueError(f"Unsupported language code: {v}")
|
| 273 |
return v
|
| 274 |
|
| 275 |
+
@model_validator(mode='after')
|
| 276 |
+
def validate_article_consistency(self):
|
| 277 |
"""Cross-field validation"""
|
| 278 |
+
text_content = self.text_content
|
| 279 |
+
title = self.title
|
| 280 |
|
| 281 |
if text_content and title:
|
| 282 |
# Check if title and content are suspiciously similar
|
|
|
|
| 287 |
# This is fine, just noting high similarity
|
| 288 |
pass
|
| 289 |
|
| 290 |
+
return self
|
| 291 |
|
| 292 |
@property
|
| 293 |
def text_quality_level(self) -> TextQualityLevel:
|
|
|
|
| 343 |
|
| 344 |
articles: List[NewsArticleSchema] = Field(
|
| 345 |
...,
|
| 346 |
+
min_length=1,
|
| 347 |
+
max_length=10000,
|
| 348 |
description="List of articles to validate"
|
| 349 |
)
|
| 350 |
|
|
|
|
| 365 |
description="Minimum quality score threshold"
|
| 366 |
)
|
| 367 |
|
| 368 |
+
@field_validator('articles')
|
| 369 |
+
@classmethod
|
| 370 |
def validate_article_list(cls, v):
|
| 371 |
"""Validate article list"""
|
| 372 |
if not v:
|
|
|
|
| 515 |
description="Validation summary statistics"
|
| 516 |
)
|
| 517 |
|
| 518 |
+
@field_validator('valid_articles', 'invalid_articles')
|
| 519 |
+
@classmethod
|
| 520 |
+
def validate_article_counts(cls, v, info):
|
| 521 |
"""Validate article count consistency"""
|
| 522 |
+
if 'total_articles' in info.data:
|
| 523 |
+
total = info.data['total_articles']
|
| 524 |
if v > total:
|
| 525 |
raise ValueError("Article count cannot exceed total")
|
| 526 |
return v
|
| 527 |
|
| 528 |
+
@model_validator(mode='after')
|
| 529 |
+
def validate_counts_consistency(self):
|
| 530 |
"""Validate count consistency"""
|
| 531 |
+
total = self.total_articles
|
| 532 |
+
valid = self.valid_articles
|
| 533 |
+
invalid = self.invalid_articles
|
| 534 |
|
| 535 |
if valid + invalid != total:
|
| 536 |
raise ValueError("Valid + invalid articles must equal total articles")
|
| 537 |
|
| 538 |
+
return self
|
| 539 |
|
| 540 |
@property
|
| 541 |
def success_rate(self) -> float:
|