import numpy as np
import pandas as pd
class DataGeneratingProcess:
    """Base class for implementation of data generating process.

    Subclasses implement functions that create various data generating
    processes. All data generating processes are in the package
    "dowhy.utils.dgps".

    Keyword arguments read by the constructor:

    * ``treatment``, ``outcome``, ``confounder``, ``effect_modifier`` --
      required; a missing key raises ``KeyError``. They are stored as-is.
    * ``weights``, ``bias`` -- optional, default to a fresh empty dict.
    * ``seed`` -- optional; when not ``None`` it seeds NumPy's global RNG.
    * ``treatment_is_binary`` -- optional flag, defaults to ``False``.
    * ``percentile`` -- only accepted when ``treatment_is_binary`` is true.
      It is used as a fraction in ``[0, 1]`` (see ``convert_to_binary``) and
      defaults to ``DEFAULT_PERCENTILE``. For a non-binary treatment the
      attribute is set to the placeholder string ``"NA"``.
    """

    # Fraction (not a 0-100 percentage) used when no percentile is supplied.
    DEFAULT_PERCENTILE = 0.9

    def __init__(self, **kwargs):
        """Store the configuration of the data generating process.

        Raises:
            KeyError: if one of the required keys is missing.
            ValueError: if ``percentile`` is given for a non-binary treatment.
        """
        self.treatment = kwargs['treatment']
        self.outcome = kwargs['outcome']
        self.confounder = kwargs['confounder']
        self.effect_modifier = kwargs['effect_modifier']
        self.weights = kwargs.pop('weights', {})
        self.bias = kwargs.pop('bias', {})
        self.seed = kwargs.pop('seed', None)
        self.treatment_is_binary = kwargs.pop('treatment_is_binary', False)

        if self.treatment_is_binary:
            self.percentile = kwargs.pop(
                'percentile', DataGeneratingProcess.DEFAULT_PERCENTILE
            )
        elif kwargs.pop('percentile', None) is not None:
            raise ValueError('Cannot use percentile, if the input is non-binary')
        else:
            # Placeholder shown by __str__ when no percentile applies.
            self.percentile = "NA"

        self.true_value = None

        if self.seed is not None:
            # Seeds the process-wide NumPy RNG, which convert_to_binary uses.
            np.random.seed(self.seed)

    def generate_data(self):
        """Generate data; must be implemented by subclasses."""
        raise NotImplementedError()

    def generation_process(self):
        """Describe the generation process; must be implemented by subclasses."""
        raise NotImplementedError()

    def convert_to_binary(self, data, deterministic=False):
        """Convert continuous values into a 0/1 float array.

        Args:
            data: 2-D NumPy array; the stochastic branch reads only column 0.
            deterministic: when true, threshold ``data`` at the
                ``self.percentile`` quantile of each column. Otherwise draw
                Bernoulli labels whose success probability grows with the
                rank of the value in column 0.

        Returns:
            A float array of zeros and ones. The deterministic branch keeps
            the shape of ``data``; the stochastic branch returns shape
            ``(data.shape[0], 1)``.
        """
        if deterministic:
            # self.percentile is a fraction in [0, 1] (it is also used as a
            # probability below), so it must go through np.quantile.
            # np.percentile expects q in [0, 100] and would read 0.9 as the
            # 0.9th percentile.
            threshold = np.quantile(data, self.percentile, axis=0)
            # NOTE(review): this labels values at or below the threshold as 1
            # (roughly a `percentile` share of rows), while the stochastic
            # branch favours high values and yields roughly `1 - percentile`
            # ones. Confirm which direction is intended.
            binary_treat_value = data <= threshold
        else:
            n_rows = data.shape[0]
            order = data.argsort(axis=0)
            ranks = np.empty_like(order)
            # Only column 0 of `ranks` is filled and used.
            ranks[order[:, 0], 0] = np.arange(n_rows)
            prob_t = ranks / n_rows
            # Generating data with equal 0 and 1 (since ranks are uniformly
            # distributed).
            binary_treat_value = np.random.binomial(1, prob_t[:, 0], n_rows)
            # Flipping some values so the share of ones moves from about one
            # half towards 1 - percentile.
            if self.percentile >= 0.5:
                is_one = binary_treat_value == 1
                # Keep each 1 with probability 2 * (1 - percentile).
                keep = np.random.binomial(
                    1, (1 - self.percentile) * 2, int(np.count_nonzero(is_one))
                )
                binary_treat_value[is_one] = keep
            else:
                is_zero = binary_treat_value == 0
                # Turn each 0 into 1 with probability 1 - 2 * percentile.
                flip = np.random.binomial(
                    1, 1 - self.percentile * 2, int(np.count_nonzero(is_zero))
                )
                binary_treat_value[is_zero] = flip
            binary_treat_value = binary_treat_value[:, np.newaxis]
        return binary_treat_value.astype(float)

    def __str__(self):
        """Return a multi-line summary of the configuration."""
        template = (
            "\n"
            "treatment:{}\n"
            "outcome:{}\n"
            "confounder: {}\n"
            "effect_modifier: {}\n"
            "weights: {}\n"
            "bias: {}\n"
            "seed: {}\n"
            "treatment_is_binary: {}\n"
            "percentile: {}\n"
        )
        return template.format(
            self.treatment,
            self.outcome,
            self.confounder,
            self.effect_modifier,
            self.weights,
            self.bias,
            self.seed,
            self.treatment_is_binary,
            self.percentile,
        )