synthetic_data_generator

TemporalScope/src/temporalscope/datasets/synthetic_data_generator.py

This module provides utilities for generating synthetic time series data specifically for testing and validation purposes. While TemporalScope uses Narwhals for backend-agnostic operations, this generator serves as a defensive programming tool to ensure:

Runtime Testing: Generate test data across different DataFrame backends to verify behavior
Edge Case Coverage: Create data with nulls, NaNs, and various data types
Backend Validation: Test Narwhals operations with different DataFrame implementations

The generator creates consistent test data that matches the TimeFrame API's expected structure (see core_utils.py for data structure details). This helps maintain code stability by providing reliable test data that works across all supported backends.

Note: This module is primarily intended for testing purposes, not for production data generation.

FUNCTION	DESCRIPTION
`generate_synthetic_time_series`	Generate synthetic time series data with specified backend support and configurations.

ATTRIBUTE	DESCRIPTION
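`RANDOM_SEED`	Default seed for random number generation; used as the default value of the `random_seed` parameter of `generate_synthetic_time_series`.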

RANDOM_SEED

RANDOM_SEED = 100

generate_synthetic_time_series

generate_synthetic_time_series(
    backend: str,
    *,
    num_samples: int = 100,
    num_features: int = 3,
    with_nulls: bool = False,
    with_nans: bool = False,
    null_percentage: float = 0.05,
    nan_percentage: float = 0.05,
    mode: str = "single_target",
    time_col_numeric: bool = False,
    drop_time: bool = False,
    random_seed: int = RANDOM_SEED
) -> FrameT

Generate synthetic time series data with specified backend support and configurations.

PARAMETER	DESCRIPTION
`backend`	Backend to use for generated data (must be supported by Narwhals) TYPE: `str`
`num_samples`	Number of samples (rows) to generate, by default 100 TYPE: `int` DEFAULT: `100`
`num_features`	Number of feature columns to generate, by default 3 TYPE: `int` DEFAULT: `3`
`with_nulls`	Whether to introduce None values in feature columns, by default False TYPE: `bool` DEFAULT: `False`
`with_nans`	Whether to introduce NaN values in feature columns, by default False TYPE: `bool` DEFAULT: `False`
`null_percentage`	Percentage of rows to contain null values (0.0 to 1.0), by default 0.05 TYPE: `float` DEFAULT: `0.05`
`nan_percentage`	Percentage of rows to contain NaN values (0.0 to 1.0), by default 0.05 TYPE: `float` DEFAULT: `0.05`
`mode`	Mode for data generation, by default "single_target" TYPE: `str` DEFAULT: `'single_target'`
`time_col_numeric`	If True, time column is numeric instead of datetime, by default False TYPE: `bool` DEFAULT: `False`
`drop_time`	If True, time column is omitted from output, by default False TYPE: `bool` DEFAULT: `False`
`random_seed`	Seed for random number generation, by default RANDOM_SEED TYPE: `int` DEFAULT: `RANDOM_SEED`

RETURNS	DESCRIPTION
`FrameT`	Narwhals DataFrame containing generated synthetic data

RAISES	DESCRIPTION
`ValueError`	If the backend is not supported by Narwhals; if an invalid mode is specified (only "single_target" is supported); or if parameters are invalid (negative samples/features, invalid percentages).

Notes

For datasets with few rows, ensures at least one row has nulls/NaNs if enabled
For single-row datasets, nulls take precedence over NaNs if both enabled
Time column can be numeric (timestamps) or datetime based on time_col_numeric

Source code in src/temporalscope/datasets/synthetic_data_generator.py

@nw.narwhalify
def generate_synthetic_time_series(
    backend: str,
    *,  # everything after ``backend`` must be passed by keyword
    num_samples: int = 100,
    num_features: int = 3,
    with_nulls: bool = False,
    with_nans: bool = False,
    null_percentage: float = 0.05,
    nan_percentage: float = 0.05,
    mode: str = "single_target",
    time_col_numeric: bool = False,
    drop_time: bool = False,
    random_seed: int = RANDOM_SEED,
) -> FrameT:
    """Build a synthetic time series frame for the requested backend.

    The frame is assembled with pandas (optional ``time`` column, a
    ``target`` column and ``feature_1`` .. ``feature_<num_features>``
    columns filled with uniform random values), optionally receives
    nulls/NaNs in its feature columns, is passed through a Narwhals
    ``select`` and is finally handed over in the requested backend.

    Parameters
    ----------
    backend : str
        Name of the backend for the generated data (must be supported by
        Narwhals). Compared case-insensitively.
    num_samples : int, optional
        Number of rows to generate, by default 100
    num_features : int, optional
        Number of feature columns to generate, by default 3
    with_nulls : bool, optional
        Introduce None values in the feature columns, by default False
    with_nans : bool, optional
        Introduce NaN values in the feature columns, by default False
    null_percentage : float, optional
        Share of rows that receive null values (0.0 to 1.0), by default 0.05
    nan_percentage : float, optional
        Share of rows that receive NaN values (0.0 to 1.0), by default 0.05
    mode : str, optional
        Data generation mode, by default "single_target"
    time_col_numeric : bool, optional
        When True the time column holds floats ``0.0, 1.0, ...`` instead of
        daily datetimes starting at 2023-01-01, by default False
    drop_time : bool, optional
        When True the time column is left out of the output, by default False
    random_seed : int, optional
        Seed passed to ``np.random.seed``, by default RANDOM_SEED

    Returns
    -------
    FrameT
        Frame containing the generated synthetic data.

    Raises
    ------
    ValueError
        If backend not supported by Narwhals
        If invalid mode specified (only "single_target" supported)
        If invalid parameters (negative samples/features, invalid percentages)

    Notes
    -----
    - For datasets with few rows, ensures at least one row has nulls/NaNs if enabled
    - For single-row datasets, nulls take precedence over NaNs if both enabled
    - Seeding goes through ``np.random.seed``, i.e. NumPy's global random state
      is reset on every call.
    """
    _validate_synthetic_data_params(
        backend=backend,
        num_samples=num_samples,
        num_features=num_features,
        mode=mode,
        null_percentage=null_percentage,
        nan_percentage=nan_percentage,
    )

    # Reset the global NumPy RNG so repeated calls with the same seed agree.
    np.random.seed(random_seed)

    # The time axis is either a float counter or a daily datetime range.
    if time_col_numeric:
        time_values = np.arange(num_samples, dtype=np.float64)
    else:
        time_values = pd.date_range("2023-01-01", periods=num_samples)

    # Column order matters: time (optional), target, then the features.
    # The random draws happen in exactly that order as well.
    data = {} if drop_time else {"time": time_values}
    data["target"] = np.random.rand(num_samples)
    for idx in range(num_features):
        data[f"feature_{idx+1}"] = np.random.rand(num_samples)

    frame = pd.DataFrame(data)

    # Missing values are only ever injected into the feature columns.
    feature_cols = [name for name in frame.columns if name.startswith("feature_")]
    wants_missing = with_nulls or with_nans
    if feature_cols and wants_missing:
        if num_samples == 1:
            _apply_nulls_nans_single_row(frame, feature_cols, with_nulls, with_nans)
        else:
            _apply_nulls_nans_multi_row(
                frame, feature_cols, with_nulls, with_nans, null_percentage, nan_percentage, num_samples
            )

    # Explicit column selection through Narwhals expressions.
    ordered_names = [] if drop_time else ["time"]
    ordered_names.append("target")
    ordered_names.extend(f"feature_{idx+1}" for idx in range(num_features))

    result = nw.from_native(frame).select([nw.col(name).alias(name) for name in ordered_names])

    requested = backend.lower()
    if requested == "pandas":
        return result

    # Re-wrap so the Narwhals API is available for unwrapping below.
    wrapped = nw.from_native(result)

    if requested == "polars":
        import polars as pl

        return pl.from_pandas(wrapped.to_native())

    # NOTE(review): no conversion happens for any other backend; the
    # pandas-built frame is simply unwrapped and returned. Confirm whether
    # callers expect a real conversion here.
    return wrapped.to_native()