Source code for statinf.data.GenerateData

import numpy as np
import pandas as pd

def generate_dataset(coeffs, n, std_dev, intercept=0., distribution='normal', binary=False, seed=None, **kwargs):
    """Generate an artificial dataset

    Each covariate is drawn independently from the chosen distribution and the
    output is computed as :math:`Y = X \\beta + \\text{intercept} + \\epsilon`
    with :math:`\\epsilon \\sim \\mathcal{N}(0, 1)`.

    :param coeffs: List of coefficients to use for computing the output variable.
    :type coeffs: :obj:`list`
    :param n: Number of observations to generate.
    :type n: :obj:`int`
    :param std_dev: Standard deviation associated with each coefficient (a scalar or one value per coefficient).
        It is only checked for consistency with :obj:`coeffs`; the spread of the generated
        covariates is controlled through :obj:`**kwargs` (e.g. :obj:`scale`).
    :type std_dev: :obj:`list`
    :param intercept: Value of the intercept to be set, defaults to 0.
    :type intercept: :obj:`float`, optional
    :param distribution: Type of distribution to use for generating the input variables (case-insensitive), defaults to 'normal'. Can be:

        * `normal`: :math:`X \\sim \\mathcal{N}(\\mu, \\sigma^{2})`
        * `uniform`: :math:`X \\sim \\mathcal{U}_{[\\text{low}, \\text{high}]}`

    :type distribution: :obj:`str`, optional
    :param binary: Define if output is binary (1 when the continuous output is strictly positive, else 0), defaults to False.
    :type binary: :obj:`bool`, optional
    :param seed: Random seed, defaults to None (the global NumPy random state is used).
    :type seed: :obj:`int`, optional

    :param \\*\\*kwargs: Arguments to be passed in the distribution function. Can be:

        * `normal`: :obj:`loc` = :math:`\\mu` and :obj:`scale` = :math:`\\sigma`
        * `uniform`: :obj:`low` and :obj:`high`

    :raises ValueError: If :obj:`coeffs` is empty, if :obj:`std_dev` is a sequence whose length differs
        from :obj:`coeffs`, or if :obj:`distribution` is not supported.

    :return: DataFrame with output variable named as :obj:`Y` and covariates as :obj:`X0`, :obj:`X1`, :obj:`X2`, ...
    :rtype: :obj:`pandas.DataFrame`
    """

    # Compare against None so that a seed of 0 is honoured as a real seed
    rdm = np.random.RandomState(seed) if seed is not None else np.random

    # Number of predictors to generate
    p = len(coeffs)
    if p == 0:
        raise ValueError('coeffs must contain at least one coefficient.')
    # A sequence of standard deviations must line up with the coefficients
    if np.ndim(std_dev) > 0 and len(std_dev) != p:
        raise ValueError('std_dev must be a scalar or have the same length as coeffs '
                         '(got %d values for %d coefficients).' % (len(std_dev), p))

    # Pick the sampler once instead of re-testing the name for every predictor
    dist_name = distribution.lower()
    if dist_name == 'normal':
        sampler = rdm.normal
    elif dist_name == 'uniform':
        sampler = rdm.uniform
    else:
        raise ValueError("Unsupported distribution %r: expected 'normal' or 'uniform'." % (distribution,))

    # One draw of `n` observations per predictor, then transpose to get the
    # usual design matrix with `n` rows and `p` columns.
    # The covariates are drawn before the noise so seeded results stay stable.
    X = np.array([sampler(size=n, **kwargs) for _ in range(p)]).T
    e = rdm.normal(loc=0., scale=1., size=n)

    # (n x p) design matrix times the p coefficients gives one value per observation
    y = X.dot(coeffs) + e + intercept

    if binary:
        # Threshold the latent continuous output at zero
        y = (y > 0).astype('int64')

    # Name the covariates X0, X1, ... directly at construction
    df = pd.DataFrame(X, columns=['X' + str(i) for i in range(p)])

    # Append Y
    df['Y'] = y

    return df