Skip to content

dataset_validator

TemporalScope/src/temporalscope/datasets/dataset_validator.py.

This module provides backend-agnostic dataset validation utilities based on research-backed heuristics. Using Narwhals operations, it enables consistent validation across different DataFrame backends while supporting domain-specific requirements through customizable thresholds.

CLASS DESCRIPTION
DatasetValidator

A validator for ensuring dataset quality using research-backed heuristics.

ValidationResult

Container for dataset validation results.

DatasetValidator

DatasetValidator(
    time_col: str,
    target_col: str,
    min_samples: int = 3000,
    max_samples: int = 50000,
    min_features: int = 4,
    max_features: int = 500,
    max_feature_ratio: float = 0.1,
    min_unique_values: int = 10,
    max_categorical_values: int = 20,
    class_imbalance_threshold: float = 1.5,
    checks_to_run: Optional[List[str]] = None,
    enable_warnings: bool = True,
)

A validator for ensuring dataset quality using research-backed heuristics.

METHOD DESCRIPTION
fit

Validate input DataFrame and prepare for validation checks.

fit_transform

Fit the validator and run validation checks in one step.

print_report

Print validation results in a tabular format.

transform

Run configured validation checks on the DataFrame.

ATTRIBUTE DESCRIPTION
AVAILABLE_CHECKS

checks_to_run

class_imbalance_threshold

enable_warnings

max_categorical_values

max_feature_ratio

max_features

max_samples

min_features

min_samples

min_unique_values

target_col

time_col

Source code in src/temporalscope/datasets/dataset_validator.py
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
def __init__(
    self,
    time_col: str,
    target_col: str,
    min_samples: int = 3000,
    max_samples: int = 50000,
    min_features: int = 4,
    max_features: int = 500,
    max_feature_ratio: float = 0.1,
    min_unique_values: int = 10,
    max_categorical_values: int = 20,
    class_imbalance_threshold: float = 1.5,
    checks_to_run: Optional[List[str]] = None,
    enable_warnings: bool = True,
):
    """Initialize the validator with column configuration and thresholds.

    Every argument except ``checks_to_run`` is stored unchanged on the
    instance under the same name; no range validation is applied to the
    thresholds here.

    :param time_col: Name of the time column.
    :param target_col: Name of the target column.
    :param checks_to_run: Names of the checks to enable. ``None`` or an empty
        list enables every name in ``AVAILABLE_CHECKS``.
    :param enable_warnings: Stored flag read later when deciding whether to
        emit warnings.
    :raises ValueError: If ``checks_to_run`` contains a name that is not in
        ``AVAILABLE_CHECKS``.
    """
    self.time_col = time_col
    self.target_col = target_col
    self.min_samples = min_samples
    self.max_samples = max_samples
    self.min_features = min_features
    self.max_features = max_features
    self.max_feature_ratio = max_feature_ratio
    self.min_unique_values = min_unique_values
    self.max_categorical_values = max_categorical_values
    self.class_imbalance_threshold = class_imbalance_threshold
    self.enable_warnings = enable_warnings

    # Validate and store checks to run
    if checks_to_run:
        requested_checks = set(checks_to_run)
        invalid_checks = requested_checks - self.AVAILABLE_CHECKS
        if invalid_checks:
            raise ValueError(f"Invalid checks: {invalid_checks}")
        self.checks_to_run = requested_checks
    else:
        # Copy rather than alias: AVAILABLE_CHECKS is shared class state, so
        # mutating this instance's set must never change it for other instances.
        self.checks_to_run = set(self.AVAILABLE_CHECKS)

AVAILABLE_CHECKS

# Names of the validation checks that may be passed as `checks_to_run`;
# `__init__` raises ValueError for any requested name outside this set and
# falls back to the full set when no checks are requested.
AVAILABLE_CHECKS = {
    "sample_size",
    "feature_count",
    "feature_ratio",
    "feature_variability",
    "categorical_cardinality",
    "class_balance",
    "binary_features",
}

checks_to_run

checks_to_run = set(checks_to_run)

class_imbalance_threshold

class_imbalance_threshold = class_imbalance_threshold

enable_warnings

enable_warnings = enable_warnings

max_categorical_values

max_categorical_values = max_categorical_values

max_feature_ratio

max_feature_ratio = max_feature_ratio

max_features

max_features = max_features

max_samples

max_samples = max_samples

min_features

min_features = min_features

min_samples

min_samples = min_samples

min_unique_values

min_unique_values = min_unique_values

target_col

target_col = target_col

time_col

time_col = time_col

fit

fit(df: Union[Any, FrameT]) -> DatasetValidator

Validate input DataFrame and prepare for validation checks.

Source code in src/temporalscope/datasets/dataset_validator.py
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
def fit(self, df: Union[Any, FrameT]) -> "DatasetValidator":
    """Validate input DataFrame and prepare for validation checks.

    The frame is first passed through ``self._ensure_narwhals_df``. It is then
    checked for the presence of ``time_col`` and ``target_col``, for every
    non-time column accepting a cast to ``Float64``, and for null values.

    :param df: Input frame; converted via ``self._ensure_narwhals_df``.
    :returns: ``self``, so calls can be chained.
    :raises ValueError: If a required column is missing, if a non-time column
        cannot be cast to ``Float64``, or if any column contains nulls.
    """
    # Convert to Narwhals DataFrame
    df = self._ensure_narwhals_df(df)

    # Validate required columns exist
    if self.time_col not in df.columns or self.target_col not in df.columns:
        raise ValueError(f"Columns {self.time_col} and {self.target_col} must exist")

    # Validate numeric columns (the time column is exempt).
    # NOTE(review): the select result is discarded and never collected; on lazy
    # backends this presumably only builds a plan, so a bad cast may not
    # surface here -- confirm.
    for col in df.columns:
        if col != self.time_col:
            try:
                df.select([nw.col(col).cast(nw.Float64)])
            except Exception as e:
                # Chain explicitly so the backend error is kept as __cause__.
                raise ValueError(f"Column {col} must be numeric. Error: {e!s}") from e

    # Check nulls
    null_counts = {}
    for col in df.columns:
        null_count = df.select([nw.col(col).is_null().sum().cast(nw.Int64).alias("nulls")])
        # Results exposing `collect` (presumably lazy frames) are materialized first.
        if hasattr(null_count, "collect"):
            null_count = null_count.collect()
        value = null_count["nulls"][0]
        # Scalars exposing `as_py` (presumably PyArrow) are unwrapped to Python values.
        if hasattr(value, "as_py"):
            value = value.as_py()
        null_counts[col] = int(value)

    # Raise error if any nulls found
    null_columns = [col for col, count in null_counts.items() if count > 0]
    if null_columns:
        raise ValueError(f"Missing values detected in columns: {', '.join(null_columns)}")

    return self

fit_transform

fit_transform(
    df: Union[Any, FrameT], target_col: Optional[str] = None
) -> Dict[str, ValidationResult]

Fit the validator and run validation checks in one step.

Source code in src/temporalscope/datasets/dataset_validator.py
433
434
435
def fit_transform(self, df: Union[Any, FrameT], target_col: Optional[str] = None) -> Dict[str, ValidationResult]:
    """Fit the validator on ``df`` and then run the validation checks on it.

    :param df: Frame passed first to ``fit`` and then to ``transform``.
    :param target_col: Forwarded unchanged to ``transform``.
    :returns: Whatever ``transform`` returns for the fitted validator.
    """
    fitted_validator = self.fit(df)
    validation_results = fitted_validator.transform(df, target_col)
    return validation_results

print_report

print_report(results: Dict[str, ValidationResult]) -> None

Print validation results in a tabular format.

Source code in src/temporalscope/datasets/dataset_validator.py
437
438
439
440
441
442
443
444
445
446
447
448
def print_report(self, results: Dict[str, ValidationResult]) -> None:
    """Print validation results in a tabular format.

    One row is printed per entry of ``results`` with the check name, a
    pass/fail mark, the result message and the result details rendered as
    ``key: value`` pairs. The table is rendered with ``tabulate`` in ``grid``
    format and written to standard output.

    :param results: Mapping of check name to its validation result.
    """
    rows = []
    for check_name, result in results.items():
        status = "✓" if result.passed else "✗"
        # Fall back to text that matches the outcome, so a failed check with
        # no message is never reported as "Check passed".
        message = result.message or ("Check passed" if result.passed else "Check failed")
        details = ", ".join(f"{k}: {v}" for k, v in (result.details or {}).items())
        rows.append([check_name, status, message, details])

    print("\nDataset Validation Report")
    print(tabulate(rows, headers=["Check", "Status", "Message", "Details"], tablefmt="grid"))
    print("\nNote: These are research-backed recommendations and may not apply to all use cases.")

transform

transform(
    df: FrameT, target_col: Optional[str] = None
) -> Dict[str, ValidationResult]

Run configured validation checks on the DataFrame.

Source code in src/temporalscope/datasets/dataset_validator.py
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
@nw.narwhalify
def transform(self, df: FrameT, target_col: Optional[str] = None) -> Dict[str, ValidationResult]:
    """Run the validation checks on ``df`` and collect their results.

    A fixed sequence of five check names is passed one by one to
    ``self._execute_check``; checks that return ``None`` are left out of the
    result. If any collected result did not pass and ``enable_warnings`` is
    set, a single warning is emitted: ``RuntimeWarning`` when a failed result
    has severity ``"ERROR"``, otherwise ``UserWarning``.

    NOTE(review): the names are hard-coded here and omit two entries of
    ``AVAILABLE_CHECKS``; whether ``checks_to_run`` is honoured presumably
    depends on ``_execute_check`` -- confirm.

    :param df: Frame to validate.
    :param target_col: Forwarded unchanged to ``self._execute_check``.
    :returns: Mapping of check name to its result, in execution order.
    """
    ordered_checks = ("sample_size", "feature_count", "feature_ratio", "feature_variability", "class_balance")

    results = {}
    for name in ordered_checks:
        outcome = self._execute_check(name, df, target_col)
        if outcome is None:
            continue
        results[name] = outcome

    # Only failed results matter for deciding which warning, if any, to emit.
    failures = [outcome for outcome in results.values() if not outcome.passed]
    if failures:
        has_critical = any(outcome.severity == "ERROR" for outcome in failures)
        if self.enable_warnings:
            if has_critical:
                warnings.warn(
                    "Critical validation checks failed. These failures may significantly impact model performance.",
                    RuntimeWarning,
                )
            else:
                warnings.warn(
                    "Some validation checks failed. These are research-backed recommendations "
                    "and may not apply to all use cases. Adjust thresholds as needed.",
                    UserWarning,
                )

    return results

ValidationResult

ValidationResult(
    passed: bool,
    message: Optional[str] = None,
    details: Optional[Dict[str, Any]] = None,
    severity: Optional[str] = None,
)

Container for dataset validation results.

METHOD DESCRIPTION
get_failed_checks

Get all failed validation checks.

get_validation_summary

Get summary statistics.

to_dict

Convert result to dictionary for serialization.

to_log_entry

Format result as a structured log entry.

ATTRIBUTE DESCRIPTION
details

TYPE: Optional[Dict[str, Any]]

message

TYPE: Optional[str]

passed

TYPE: bool

severity

TYPE: Optional[str]

details

details: Optional[Dict[str, Any]] = None

message

message: Optional[str] = None

passed

passed: bool

severity

severity: Optional[str] = None

get_failed_checks

get_failed_checks(
    results: Dict[str, ValidationResult]
) -> Dict[str, ValidationResult]

Get all failed validation checks.

Source code in src/temporalscope/datasets/dataset_validator.py
57
58
59
60
@classmethod
def get_failed_checks(cls, results: Dict[str, "ValidationResult"]) -> Dict[str, "ValidationResult"]:
    """Return only the entries of ``results`` whose ``passed`` flag is falsy.

    :param results: Mapping of check name to its validation result.
    :returns: New dict with the failed checks, in their original order.
    """
    failed = {}
    for check_name, outcome in results.items():
        if outcome.passed:
            continue
        failed[check_name] = outcome
    return failed

get_validation_summary

get_validation_summary(
    results: Dict[str, ValidationResult]
) -> Dict[str, Any]

Get summary statistics.

Source code in src/temporalscope/datasets/dataset_validator.py
62
63
64
65
66
67
68
69
70
@classmethod
def get_validation_summary(cls, results: Dict[str, "ValidationResult"]) -> Dict[str, Any]:
    """Summarize a set of validation results.

    :param results: Mapping of check name to its validation result.
    :returns: Dict with the total number of checks, the passed and failed
        counts, and each result's ``to_dict()`` output keyed by check name.
    """
    outcomes = list(results.values())
    passed_count = len([outcome for outcome in outcomes if outcome.passed])
    failed_count = len([outcome for outcome in outcomes if not outcome.passed])

    per_check = {}
    for check_name, outcome in results.items():
        per_check[check_name] = outcome.to_dict()

    summary = {
        "total_checks": len(results),
        "passed_checks": passed_count,
        "failed_checks": failed_count,
        "check_details": per_check,
    }
    return summary

to_dict

to_dict() -> Dict[str, Any]

Convert result to dictionary for serialization.

Source code in src/temporalscope/datasets/dataset_validator.py
44
45
46
def to_dict(self) -> Dict[str, Any]:
    """Return the result's four fields as a plain dictionary.

    :returns: Dict with the keys ``passed``, ``message``, ``details`` and
        ``severity``, each mapped to the attribute of the same name.
    """
    exported_fields = ("passed", "message", "details", "severity")
    return {field: getattr(self, field) for field in exported_fields}

to_log_entry

to_log_entry() -> Dict[str, Any]

Format result as a structured log entry.

Source code in src/temporalscope/datasets/dataset_validator.py
48
49
50
51
52
53
54
55
def to_log_entry(self) -> Dict[str, Any]:
    """Build a structured log entry describing this result.

    :returns: Dict with the pass flag, message and details under
        ``validation_*`` keys, plus a ``log_level``. The level is the
        result's severity when that is set (truthy); otherwise ``"INFO"``
        for a passed result and ``"WARNING"`` for a failed one.
    """
    level = self.severity
    if not level:
        if self.passed:
            level = "INFO"
        else:
            level = "WARNING"

    entry = {}
    entry["validation_passed"] = self.passed
    entry["validation_message"] = self.message
    entry["validation_details"] = self.details
    entry["log_level"] = level
    return entry