Ahmedik95316 committed on
Commit bfc4267 · 1 Parent(s): cecd6fa

Update data/prepare_datasets.py

Files changed (1)
  1. data/prepare_datasets.py +112 -47
data/prepare_datasets.py CHANGED
@@ -8,6 +8,9 @@ from sklearn.model_selection import train_test_split
 import hashlib
 import json
 from datetime import datetime
+from data.data_validator import DataValidationPipeline
+from data.validation_schemas import ValidationLevel, DataSource
+from typing import Tuple, Dict

 # Configure logging
 logging.basicConfig(
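
The three new imports pull the validation machinery from data/data_validator.py and data/validation_schemas.py, neither of which is part of this commit. Going only by how they are used in the hunks below (validate_batch(...), ValidationLevel.MODERATE, per-article is_valid and quality_metrics, and batch-level success_rate and overall_quality_score), a minimal stand-in consistent with those call sites might look like the sketch below; every name beyond those call sites is an assumption, and DataSource is imported but not referenced in the changed lines shown here.

# Hypothetical stand-in for data/data_validator.py and data/validation_schemas.py.
# Only the attributes used by prepare_datasets.py below are modeled; the real
# modules in the repository are not shown in this diff and are assumed richer.
from dataclasses import dataclass, field
from enum import Enum
from typing import Dict, List


class ValidationLevel(Enum):
    STRICT = "strict"
    MODERATE = "moderate"   # the level requested in this file
    LENIENT = "lenient"


@dataclass
class ArticleValidationResult:
    is_valid: bool
    quality_metrics: Dict[str, float] = field(default_factory=dict)


@dataclass
class BatchValidationResult:
    validation_results: List[ArticleValidationResult]
    success_rate: float
    overall_quality_score: float


class DataValidationPipeline:
    def validate_batch(self, articles: List[Dict], batch_id: str,
                       validation_level: ValidationLevel) -> BatchValidationResult:
        # Toy rule: an article is "valid" if its text is reasonably long.
        results = [
            ArticleValidationResult(
                is_valid=len(a.get("text", "")) >= 50,
                quality_metrics={"overall_quality_score": min(1.0, len(a.get("text", "")) / 1000)},
            )
            for a in articles
        ]
        valid = sum(r.is_valid for r in results)
        return BatchValidationResult(
            validation_results=results,
            success_rate=valid / max(1, len(results)),
            overall_quality_score=sum(r.quality_metrics["overall_quality_score"] for r in results) / max(1, len(results)),
        )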
@@ -131,7 +134,7 @@ class DatasetPreparer:
             ], ignore_index=True)

             logger.info(f"Combined Kaggle dataset: {len(df_combined)} samples")
-            return df_combined
+            return self.validate_dataset_with_schemas(df_combined, 'kaggle_combined')

         except Exception as e:
             logger.error(f"Error loading Kaggle dataset: {e}")
@@ -201,7 +204,7 @@ class DatasetPreparer:
         if liar_dfs:
             combined_liar = pd.concat(liar_dfs, ignore_index=True)
             logger.info(f"Combined LIAR dataset: {len(combined_liar)} samples")
-            return combined_liar
+            return self.validate_dataset_with_schemas(combined_liar, 'liar_combined')
         else:
             logger.warning("No LIAR data could be processed")
             return None
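
With both loaders now returning self.validate_dataset_with_schemas(...), callers receive a (DataFrame, validation-summary dict) pair instead of a bare DataFrame, or None when loading fails. A small hedged illustration of the new contract, assuming the repository's data package is importable:

# Illustrative only; DatasetPreparer construction mirrors main() in this file.
from data.prepare_datasets import DatasetPreparer

preparer = DatasetPreparer()
result = preparer.load_kaggle_dataset()
if result is None:
    print("Kaggle loading failed")
else:
    kaggle_df, kaggle_validation = result  # DataFrame of valid rows + summary dict
    print(f"kept {kaggle_validation['valid_count']} of {kaggle_validation['original_count']} articles")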
@@ -226,7 +229,8 @@ class DatasetPreparer:
         # Validate text quality
         valid_mask = df['text'].apply(self.validate_text_quality)
         df = df[valid_mask]
-        logger.info(f"Removed {initial_count - len(valid_mask.sum())} low-quality texts")
+        # logger.info(f"Removed {initial_count - len(valid_mask.sum())} low-quality texts")
+        logger.info(f"Removed {initial_count - valid_mask.sum()} low-quality texts")

         # Remove duplicates
         before_dedup = len(df)
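
The replaced logging line fixes a genuine bug: summing a boolean mask already yields the count of rows that passed, so wrapping it in len() raises a TypeError because the sum is a scalar, not a sequence. A standalone pandas illustration, independent of the repository code:

import pandas as pd

valid_mask = pd.Series([True, False, True, True])

valid_mask.sum()          # 3 -> number of rows that passed validation
# len(valid_mask.sum())   # TypeError: object of type 'numpy.int64' has no len()

initial_count = len(valid_mask)
print(f"Removed {initial_count - valid_mask.sum()} low-quality texts")  # "Removed 1 low-quality texts"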
@@ -300,63 +304,124 @@ class DatasetPreparer:
         return float(np.mean(scores))

     def prepare_datasets(self) -> Tuple[bool, str]:
-        """Main function to prepare and combine all datasets"""
+        """Main method to prepare all datasets with validation"""
+        logger.info("Starting dataset preparation with validation...")
+
         try:
-            logger.info("Starting dataset preparation process...")
-
-            datasets = []
-
-            # Load Kaggle dataset
-            kaggle_df = self.load_kaggle_dataset()
-            if kaggle_df is not None:
-                datasets.append(kaggle_df)
-
-            # Load LIAR dataset
-            liar_df = self.load_liar_dataset()
-            if liar_df is not None:
-                datasets.append(liar_df)
-
-            # Check if we have any data
-            if not datasets:
-                error_msg = "No datasets could be loaded successfully"
-                logger.error(error_msg)
-                return False, error_msg
-
-            # Combine all datasets
-            logger.info("Combining all datasets...")
-            combined_df = pd.concat(datasets, ignore_index=True)
-
-            # Validate and clean the combined dataset
-            validated_df = self.validate_dataset(combined_df)
-
-            # Check minimum sample requirement
-            if len(validated_df) < 100:
-                error_msg = f"Insufficient samples after validation: {len(validated_df)}"
-                logger.error(error_msg)
-                return False, error_msg
-
-            # Generate metadata
-            metadata = self.generate_dataset_metadata(validated_df)
-
-            # Save dataset
-            validated_df.to_csv(self.output_path, index=False)
-
-            # Save metadata
-            with open(self.metadata_path, 'w') as f:
-                json.dump(metadata, f, indent=2)
-
-            logger.info(f"Dataset preparation complete!")
-            logger.info(f"Final dataset: {len(validated_df)} samples")
-            logger.info(f"Quality score: {metadata['quality_score']:.3f}")
-            logger.info(f"Saved to: {self.output_path}")
-
-            return True, f"Successfully prepared {len(validated_df)} samples"
+            # Load and validate datasets
+            kaggle_result = self.load_kaggle_dataset()
+            liar_result = self.load_liar_dataset()
+
+            # Handle None returns gracefully
+            if kaggle_result is None:
+                logger.warning("Kaggle dataset loading failed")
+                kaggle_df, kaggle_validation = pd.DataFrame(), {
+                    'source': 'kaggle_combined', 'original_count': 0, 'valid_count': 0,
+                    'success_rate': 0, 'overall_quality_score': 0, 'validation_timestamp': datetime.now().isoformat()
+                }
+            else:
+                kaggle_df, kaggle_validation = kaggle_result
+
+            if liar_result is None:
+                logger.warning("LIAR dataset loading failed")
+                liar_df, liar_validation = pd.DataFrame(), {
+                    'source': 'liar_combined', 'original_count': 0, 'valid_count': 0,
+                    'success_rate': 0, 'overall_quality_score': 0, 'validation_timestamp': datetime.now().isoformat()
+                }
+            else:
+                liar_df, liar_validation = liar_result
+
+            # Combine datasets
+            datasets_to_combine = [df for df in [kaggle_df, liar_df] if not df.empty]
+
+            if not datasets_to_combine:
+                return False, "No datasets could be loaded and validated"
+
+            combined_df = pd.concat(datasets_to_combine, ignore_index=True)
+
+            # Save combined dataset
+            combined_df.to_csv(self.output_path, index=False)
+
+            # Save validation reports
+            total_original = kaggle_validation['original_count'] + liar_validation['original_count']
+            validation_report = {
+                'datasets': {
+                    'kaggle': kaggle_validation,
+                    'liar': liar_validation
+                },
+                'combined_stats': {
+                    'total_articles': len(combined_df),
+                    'total_original': total_original,
+                    'overall_success_rate': len(combined_df) / max(1, total_original),
+                    'validation_timestamp': datetime.now().isoformat()
+                }
+            }
+
+            validation_report_path = self.output_dir / "dataset_validation_report.json"
+            with open(validation_report_path, 'w') as f:
+                json.dump(validation_report, f, indent=2)
+
+            logger.info(f"Dataset preparation complete. Validation report saved to {validation_report_path}")
+            return True, f"Successfully prepared {len(combined_df)} validated articles"

         except Exception as e:
-            error_msg = f"Dataset preparation failed: {str(e)}"
-            logger.error(error_msg)
-            return False, error_msg
+            logger.error(f"Dataset preparation failed: {e}")
+            return False, f"Dataset preparation failed: {str(e)}"

+    def validate_dataset_with_schemas(self, df: pd.DataFrame, source_name: str) -> Tuple[pd.DataFrame, Dict]:
+        """Validate dataset using comprehensive schemas"""
+        logger.info(f"Starting schema validation for {source_name}...")
+
+        validator = DataValidationPipeline()
+
+        # Convert DataFrame to validation format
+        articles_data = []
+        for _, row in df.iterrows():
+            article_data = {
+                'text': str(row.get('text', '')),
+                'label': int(row.get('label', 0)),
+                'source': source_name
+            }
+
+            if 'title' in row and pd.notna(row['title']):
+                article_data['title'] = str(row['title'])
+            if 'url' in row and pd.notna(row['url']):
+                article_data['url'] = str(row['url'])
+
+            articles_data.append(article_data)
+
+        # Perform batch validation
+        validation_result = validator.validate_batch(
+            articles_data,
+            batch_id=f"{source_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
+            validation_level=ValidationLevel.MODERATE
+        )
+
+        # Filter valid articles and add quality scores
+        valid_indices = [i for i, result in enumerate(validation_result.validation_results) if result.is_valid]
+
+        if valid_indices:
+            valid_df = df.iloc[valid_indices].copy()
+            quality_scores = [validation_result.validation_results[i].quality_metrics.get('overall_quality_score', 0.0)
+                              for i in valid_indices]
+            valid_df['validation_quality_score'] = quality_scores
+            valid_df['validation_timestamp'] = datetime.now().isoformat()
+        else:
+            valid_df = pd.DataFrame(columns=df.columns)
+
+        validation_summary = {
+            'source': source_name,
+            'original_count': len(df),
+            'valid_count': len(valid_df),
+            'success_rate': validation_result.success_rate,
+            'overall_quality_score': validation_result.overall_quality_score,
+            'validation_timestamp': datetime.now().isoformat()
+        }
+
+        return valid_df, validation_summary
+
+
+
 def main():
     """Main execution function"""
     preparer = DatasetPreparer()
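
End to end, the rewritten prepare_datasets() validates each source, writes the combined CSV to self.output_path, and saves a JSON report to self.output_dir / "dataset_validation_report.json". A hedged usage sketch, assuming it runs from the repository root so the data package imports resolve; output_dir is used as a Path in the hunk above but is defined outside this diff:

import json
from pathlib import Path

from data.prepare_datasets import DatasetPreparer  # module path as in this commit

preparer = DatasetPreparer()               # same construction as in main()
ok, message = preparer.prepare_datasets()  # returns (bool, str) per the signature above
print(ok, message)

if ok:
    # Report location assumed from the hunk above: self.output_dir / "dataset_validation_report.json"
    report_path = Path(preparer.output_dir) / "dataset_validation_report.json"
    report = json.loads(report_path.read_text())
    print(report["combined_stats"]["total_articles"], "articles,",
          f"success rate {report['combined_stats']['overall_success_rate']:.2%}")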
 