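The code below, given first in SAS and then in Python, generates 1,000 observations, each consisting of ten input variables (x1, x2, ..., x10) and one output variable (y). The inputs are drawn from normal, lognormal, uniform and exponential distributions, as indicated in the comments; some inputs are built from others (x5 depends on x1, x6 on x2, and x9 on x2 and x8), so they are correlated. The output is computed from the true model y = 3*x2 - 4*x8 + 5*x9 + e, where e is Normal(0, 9) noise and there is no intercept term; the remaining inputs do not enter the model.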
see source
SAS
/* Simulate 1000 observations of ten predictors (x1-x10) and a response y.
   Only x2, x8 and x9 appear in the true model for y.
   Normal(m, v) in the comments below means mean m and variance v.
   A seed of 0 makes each generator take its seed from the system clock,
   so every run produces a different sample. */
data Simulate;
  do i = 1 to 1000;
    x1  = 10 + 5*rannor(0);            * Normal(10, 25);
    x2  = exp(3*rannor(0));            * lognormal with log-scale sd 3;
    x3  = 5 + 10*ranuni(0);            * Uniform(5, 15);
    x4  = 100 + 50*rannor(0);          * Normal(100, 2500);
    x5  = x1 + 3*rannor(0);            * x1 plus Normal(0, 9) noise, correlated with x1;
    x6  = 2*x2 + ranexp(0);            * 2*x2 plus a unit exponential (a sum, not a mixture);
    x7  = 0.5*exp(4*rannor(0));        * 0.5 times a lognormal with log-scale sd 4;
    x8  = 10 + 8*ranuni(0);            * Uniform(10, 18);
    x9  = x2 + x8 + 2*rannor(0);       * x2 + x8 plus Normal(0, 4) noise;
    x10 = 200 + 90*rannor(0);          * Normal(200, 8100);
    y   = 3*x2 - 4*x8 + 5*x9 + 3*rannor(0); * true model, no intercept, Normal(0, 9) error;
    output;                            * write one observation per iteration;
  end;
run;
Python
import pandas as pd
import numpy as np
# Simulate 1000 observations of ten predictors (x1..x10) and a response y.
# Only x2, x8 and x9 appear in the true model for y.
# Normal(m, v) in the comments below means mean m and variance v.
N_SAMPLES = 1000
COLUMNS = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'y']

# Collect rows in a plain list and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call, which made the original loop quadratic.
rows = []
for i in range(N_SAMPLES):
    x1 = 10 + 5 * np.random.normal()  # Normal(10, 25)
    x2 = np.exp(3 * np.random.normal())  # lognormal, log-scale sd 3
    x3 = 5 + 10 * np.random.uniform()  # Uniform[5, 15)
    x4 = 100 + 50 * np.random.normal()  # Normal(100, 2500)
    x5 = x1 + 3 * np.random.normal()  # x1 plus Normal(0, 9) noise; correlated with x1
    x6 = 2 * x2 + np.random.exponential()  # 2*x2 plus a unit exponential (a sum, not a mixture)
    x7 = 0.5 * np.exp(4 * np.random.normal())  # 0.5 times a lognormal, log-scale sd 4
    x8 = 10 + 8 * np.random.uniform()  # Uniform[10, 18)
    x9 = x2 + x8 + 2 * np.random.normal()  # x2 + x8 plus Normal(0, 4) noise
    x10 = 200 + 90 * np.random.normal()  # Normal(200, 8100)
    # True model: no intercept, Normal(0, 9) error.
    y = 3 * x2 - 4 * x8 + 5 * x9 + 3 * np.random.normal()
    rows.append({'x1': x1, 'x2': x2, 'x3': x3, 'x4': x4, 'x5': x5, 'x6': x6,
                 'x7': x7, 'x8': x8, 'x9': x9, 'x10': x10, 'y': y})

# Passing the column list explicitly fixes the column order.
df = pd.DataFrame(rows, columns=COLUMNS)
import numpy as np
# Simulate 1000 observations of ten predictors (x1..x10) and a response y.
# Only x2, x8 and x9 appear in the true model for y.
# Normal(m, v) in the comments below means mean m and variance v.
N_SAMPLES = 1000
COLUMNS = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'y']

# Collect rows in a plain list and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call, which made the original loop quadratic.
rows = []
for i in range(N_SAMPLES):
    x1 = 10 + 5 * np.random.normal()  # Normal(10, 25)
    x2 = np.exp(3 * np.random.normal())  # lognormal, log-scale sd 3
    x3 = 5 + 10 * np.random.uniform()  # Uniform[5, 15)
    x4 = 100 + 50 * np.random.normal()  # Normal(100, 2500)
    x5 = x1 + 3 * np.random.normal()  # x1 plus Normal(0, 9) noise; correlated with x1
    x6 = 2 * x2 + np.random.exponential()  # 2*x2 plus a unit exponential (a sum, not a mixture)
    x7 = 0.5 * np.exp(4 * np.random.normal())  # 0.5 times a lognormal, log-scale sd 4
    x8 = 10 + 8 * np.random.uniform()  # Uniform[10, 18)
    x9 = x2 + x8 + 2 * np.random.normal()  # x2 + x8 plus Normal(0, 4) noise
    x10 = 200 + 90 * np.random.normal()  # Normal(200, 8100)
    # True model: no intercept, Normal(0, 9) error.
    y = 3 * x2 - 4 * x8 + 5 * x9 + 3 * np.random.normal()
    rows.append({'x1': x1, 'x2': x2, 'x3': x3, 'x4': x4, 'x5': x5, 'x6': x6,
                 'x7': x7, 'x8': x8, 'x9': x9, 'x10': x10, 'y': y})

# Passing the column list explicitly fixes the column order.
df = pd.DataFrame(rows, columns=COLUMNS)