"""Generate geo-temporal synthetic datasets whose (x, y) series follow a target Pearson r.

Locations come from a StatesDataConfig dataframe; generated data is written back into
`x_i` / `y_i` columns and exported through DataExporter.
"""

import matplotlib.pyplot as plt
from matplotlib import pyplot
from sklearn.datasets import make_regression
from sklearn.preprocessing import MinMaxScaler
import sklearn
import numpy as np
from scipy.stats import pearsonr
import csv
import pandas as pd
import json
import random

from data_exporter import DataExporter
from data_states_config import StatesDataConfig


class CorrelatedDataGenerator:
    """Builds correlated (x, y) time series per location and applies geo/time noise rules."""

    def __init__(self, config):
        # Geo-temporal structure (locations, times, noise configuration)
        self.geotemp_config = config
        # Correlation targets for "not noise" locations
        self.r = 0
        self.monotonic = False
        self.type_monotonic = None  # 1 indicates increasing, -1 indicates decreasing
        # Correlation targets for the noise-exception locations
        self.r_exception = 0
        self.monotonic_exception = False
        self.type_monotonic_exception = None
        # Min and max intervals used to pick the random value range of each variable
        self.x_start_interval = [20, 30]
        self.x_end_interval = [75, 85]
        self.y_start_interval = [25, 35]
        self.y_end_interval = [40, 50]
        # Exporter output path
        self.path = 'data/'

    def increase_ranges(self):
        """Widen the random scaling intervals (used to make forced data stand out more)."""
        self.x_start_interval = [5, 50]
        self.x_end_interval = [51, 95]
        self.y_start_interval = [10, 40]
        self.y_end_interval = [41, 55]

    def normal_ranges(self):
        """Restore the default scaling intervals set in __init__."""
        self.x_start_interval = [20, 30]
        self.x_end_interval = [75, 85]
        self.y_start_interval = [25, 35]
        self.y_end_interval = [40, 50]

    def get_noise(self, r):
        """Return the regression noise level: low noise for strong |r|, high otherwise."""
        if abs(r) >= 0.8:
            return 10
        else:
            return 100

    def generate_correlated_data(self, size, r, monotonic=False, type_monotonic=None,
                                 scale_x=None, scale_y=None):
        """Generate `size` (x, y) pairs whose sample Pearson r matches the target `r`.

        Repeatedly draws a noisy regression and rescales it until the sample
        coefficient satisfies the acceptance rule for `r`.

        Args:
            size: number of samples (time steps) to generate.
            r: target Pearson coefficient (0.2, 0.8 and -0.8 have special rules).
            monotonic: if True, sort the pairs by x so the series is monotonic.
            type_monotonic: -1 reverses the series to make it decreasing.
            scale_x, scale_y: optional [min, max] ranges; random ranges are drawn
                from the configured intervals when None.

        Returns:
            (x, y, coef) where coef is the scipy `pearsonr` result.
        """
        noise = self.get_noise(r)
        x = np.zeros(size)
        y = np.zeros(size)
        ######### GENERATE DATA UNTIL DESIRED PEARSON R ##########
        while True:
            ######### GENERATE REGRESSION #########
            x_tmp, y_tmp = make_regression(n_samples=size, n_features=1,
                                           n_informative=1, noise=noise)
            x_tmp = (x_tmp.T)[0]
            ######### SCALE #########
            # NOTE(review): once chosen, the random ranges stay fixed for all
            # retries of this call (scale_x/scale_y stop being None) — presumably
            # intentional so retries stay in the same value range.
            if scale_x is None:
                x_start = np.random.randint(self.x_start_interval[0], self.x_start_interval[1])
                x_end = np.random.randint(self.x_end_interval[0], self.x_end_interval[1])
                scale_x = [x_start, x_end]
            if scale_y is None:
                y_start = np.random.randint(self.y_start_interval[0], self.y_start_interval[1])
                y_end = np.random.randint(self.y_end_interval[0], self.y_end_interval[1])
                scale_y = [y_start, y_end]
            x_scaled = sklearn.preprocessing.minmax_scale(x_tmp, feature_range=(scale_x[0], scale_x[1]))
            y_scaled = sklearn.preprocessing.minmax_scale(y_tmp, feature_range=(scale_y[0], scale_y[1]))
            for i in range(0, size):
                x[i] = x_scaled[i]
                y[i] = y_scaled[i]
            ######### MAKE PERFECT CORRELATION #########
            if abs(r) == 0.8:
                # Sorting both series yields a near-perfect monotone relation;
                # reversing y flips the sign for the negative target.
                x.sort()
                y.sort()
                if r == -0.8:
                    y = y[::-1]
            else:
                if monotonic:
                    # Reorder pairs by ascending x, keeping (x, y) pairs together.
                    indices = x_scaled.argsort()
                    for i, index in enumerate(indices):
                        x[i] = x_scaled[index]
                        y[i] = y_scaled[index]
                if not monotonic:
                    # Random pairing order (still the same set of pairs).
                    indices = list(range(0, size))
                    np.random.shuffle(indices)
                    x_tmp = x.copy()
                    y_tmp = y.copy()
                    for i in range(0, size):
                        x[i] = x_tmp[indices[i]]
                        y[i] = y_tmp[indices[i]]
                if type_monotonic == -1:
                    # Reverse both series to turn an increasing series into a
                    # decreasing one (correlation sign is unaffected).
                    x = x[::-1]
                    y = y[::-1]
            ######### BREAK IF DESIRED COEF #########
            coef = pearsonr(x, y)
            if r == 0.2 and abs(coef[0]) <= 0.2:
                break
            if r == 0.8 and coef[0] >= 0.95:
                break
            if r == -0.8 and coef[0] <= -0.95:
                break
            # Fallback for any other target (including r == 0): accept once the
            # sample coefficient is close to the target. Without this, targets
            # outside {0.2, 0.8, -0.8} looped forever (old TODO asked for this).
            if r not in (0.2, 0.8, -0.8) and abs(coef[0] - r) <= 0.1:
                break
        ######### RETURN #########
        return (x, y, coef)

    def generate_geotemp_data(self):
        """Fill every location row with correlated series, apply time exceptions and export.

        Returns the populated dataframe.
        """
        # Type monotonic can be increasing (1) or decreasing (-1)
        ######### GET TEMP INFO FROM CONFIG #########
        times_total = self.geotemp_config.times_total
        ######### GET DATAFRAME OF LOCATIONS #########
        df = self.geotemp_config.get_dataframe()
        ######### ITERATE THROUGH LOCATIONS #########
        for index, row in df.iterrows():
            # The 'notnoise_time' row is a configuration row, not a location.
            if row['state'] == 'notnoise_time':
                continue
            ######### GENERATE AND ADD DATA #########
            is_notnoise = df.loc[index, 'notnoise_geo']
            if is_notnoise == 1:
                x, y, coef = self.generate_correlated_data(
                    times_total, self.r, self.monotonic, self.type_monotonic)
            else:
                x, y, coef = self.generate_correlated_data(
                    times_total, self.r_exception, self.monotonic_exception,
                    self.type_monotonic_exception)
            for i in range(1, times_total + 1):
                col_name = 'x_' + str(i)
                df.loc[index, col_name] = x[i-1]
                col_name = 'y_' + str(i)
                df.loc[index, col_name] = y[i-1]
            df.loc[index, 'pearson_r'] = coef[0]
        ######### EXECUTE TIME EXCEPTIONS #########
        self.add_time_noise(df)
        self.force_one_time(df)
        self.force_time_range(df)
        ######### RE COMPUTE R #########
        self.compute_r(df)
        ######### EXPORT DATA TO FILES #########
        exporter = DataExporter(self.path)
        exporter.export(df, self.r, self.monotonic, self.type_monotonic,
                        self.r_exception, self.monotonic_exception,
                        self.type_monotonic_exception, self.geotemp_config)
        return df

    def add_time_noise(self, df):
        """Overwrite noise-flagged time steps of not-noise locations with exception data.

        The last dataframe row holds the time configuration; a 0 in its `x_i`
        marks time step i as a noise time to be replaced — TODO confirm against
        StatesDataConfig.
        """
        times_total = self.geotemp_config.times_total
        for index, row in df.iterrows():
            if row['state'] == 'notnoise_time':
                continue
            is_notnoise = df.loc[index, 'notnoise_geo']
            if is_notnoise == 1:
                # get scale_x: reuse the row's current value range so the
                # replacement data blends into the same scale
                x_end = 'x_' + str(times_total)
                x_current = np.array(row.loc['x_1':x_end])
                x_min = np.min(x_current)
                x_max = np.max(x_current)
                # get scale_y
                y_end = 'y_' + str(times_total)
                y_current = np.array(row.loc['y_1':y_end])
                y_min = np.min(y_current)
                y_max = np.max(y_current)
                # Get new data with the exception correlation settings
                x, y, coef = self.generate_correlated_data(
                    times_total, self.r_exception, self.monotonic_exception,
                    self.type_monotonic_exception, [x_min, x_max], [y_min, y_max])
                # Assign replacement values at every time flagged 0 in the config row
                notnoise = np.array(df.loc[len(df)-1, 'x_1':x_end].copy())
                for notnoise_index, notnoise_value in enumerate(notnoise):
                    if notnoise_value == 0:
                        col_name = 'x_' + str(notnoise_index + 1)
                        df.loc[index, col_name] = x[notnoise_index]
                        col_name = 'y_' + str(notnoise_index + 1)
                        df.loc[index, col_name] = y[notnoise_index]

    def force_one_time(self, df):
        """For each configured time, regenerate data ACROSS not-noise locations.

        No-op (returns 0) when no time is forced in the config.
        """
        if self.geotemp_config.time_forced is None:
            return 0
        locations_noise = round(self.geotemp_config.locations_total
                                * self.geotemp_config.locations_noise_perc)
        locations_notnoise = self.geotemp_config.locations_total - locations_noise
        n_times = self.geotemp_config.times_total
        # Config row: times flagged 1 get a cross-location correlated sample.
        current_conf = df.loc[len(df)-1, 'x_1':'x_'+str(n_times)]
        indices_notnoise = np.where(current_conf == 1)[0]
        #self.increase_ranges()
        for notnoise_index in indices_notnoise:
            time = notnoise_index + 1
            # One sample per not-noise location, at this single time step.
            x, y, coef = self.generate_correlated_data(locations_notnoise, self.r, False, None)
            state_index = 0
            for index, row in df.iterrows():
                if row['state'] == 'notnoise_time':
                    continue
                if row['notnoise_geo'] == 1:
                    x_name = 'x_' + str(time)
                    y_name = 'y_' + str(time)
                    df.loc[index, x_name] = x[state_index]
                    df.loc[index, y_name] = y[state_index]
                    state_index += 1
        #self.normal_ranges()

    def force_time_range(self, df):
        """Regenerate the configured time range WITHIN each not-noise location.

        No-op (returns 0) when no time range is forced in the config.
        """
        if self.geotemp_config.time_range_forced is None:
            return 0
        locations_noise = round(self.geotemp_config.locations_total
                                * self.geotemp_config.locations_noise_perc)
        locations_notnoise = self.geotemp_config.locations_total - locations_noise
        start = self.geotemp_config.time_range_forced[0]
        end = self.geotemp_config.time_range_forced[1]
        n_times = end - start + 1
        for index, row in df.iterrows():
            if row['state'] == 'notnoise_time':
                continue
            if row['notnoise_geo'] == 1:
                x, y, coef = self.generate_correlated_data(
                    n_times, self.r, self.monotonic, self.type_monotonic)
                for i in range(0, n_times):
                    x_name = 'x_' + str(start + i)
                    y_name = 'y_' + str(start + i)
                    df.loc[index, x_name] = x[i]
                    df.loc[index, y_name] = y[i]
        #self.normal_ranges()

    def shuffle_all_locations(self, df, location_forced=None):
        """Randomly permute all location rows' data columns, keeping the config row fixed.

        If `location_forced` names a state, ensure that state ends up with
        not-noise data by swapping with a random not-noise row when needed.
        """
        orginal_df = df.copy()
        indices = np.array(df.index.tolist())
        indices = indices[:-1]  # exclude the trailing 'notnoise_time' config row
        np.random.shuffle(indices)
        for index, row in df.iterrows():
            if row['state'] != 'notnoise_time':
                df.loc[index, 'x_1':'notnoise_geo'] = \
                    orginal_df.loc[indices[index], 'x_1':'notnoise_geo']
        if location_forced is not None:
            state_index = df[df['state'] == location_forced].index.values.astype(int)[0]
            if df.loc[state_index, 'notnoise_geo'] != 1:
                indices_notnoise = np.array(df[df['notnoise_geo'] == 1].index.tolist())
                # BUGFIX: randint's upper bound is exclusive. The old
                # `randint(0, len - 1)` could never pick the last not-noise row
                # and raised ValueError when only one not-noise row existed.
                swap_index = np.random.randint(0, len(indices_notnoise))
                swap_index = indices_notnoise[swap_index]
                # Explicit copy so the temp survives the overwrite below.
                tmp = df.loc[swap_index, 'x_1':'notnoise_geo'].copy()
                df.loc[swap_index, 'x_1':'notnoise_geo'] = \
                    df.loc[state_index, 'x_1':'notnoise_geo']
                df.loc[state_index, 'x_1':'notnoise_geo'] = tmp

    def shuffle_location_range(self, df, location_range_forced):
        """Permute location rows separately inside and outside a location range.

        `location_range_forced` is a column name whose value 1/0 marks rows as
        inside/outside the range; rows are only shuffled within their own group.
        """
        orginal_df = df.copy()
        ######### SHUFFLE INSIDE LOCATION RANGE #########
        indices = np.array(df[(df[location_range_forced] == 1)
                              & (df['state'] != 'notnoise_time')].index.tolist())
        np.random.shuffle(indices)
        n = 0
        for index, row in df.iterrows():
            if row['state'] != 'notnoise_time' and row[location_range_forced] == 1:
                df.loc[index, 'x_1':'notnoise_geo'] = \
                    orginal_df.loc[indices[n], 'x_1':'notnoise_geo']
                n += 1
        ######### SHUFFLE OUTSIDE LOCATION RANGE #########
        indices = np.array(df[(df[location_range_forced] == 0)
                              & (df['state'] != 'notnoise_time')].index.tolist())
        np.random.shuffle(indices)
        n = 0
        for index, row in df.iterrows():
            if row['state'] != 'notnoise_time' and row[location_range_forced] == 0:
                df.loc[index, 'x_1':'notnoise_geo'] = \
                    orginal_df.loc[indices[n], 'x_1':'notnoise_geo']
                n += 1

    def shuffle_times(self, df, n_times, time_forced=None, monotony_forced=False):
        """Permute the time columns (x_i/y_i pairs move together).

        `monotony_forced` reverses the time order instead of shuffling.
        If `time_forced` is given and does not land on a not-noise time (config
        row value 1), swap it with one that does.
        """
        original_df = df.copy()
        indices = np.array(range(1, n_times + 1))
        if monotony_forced:
            indices = indices[::-1]
        else:
            np.random.shuffle(indices)
        for index, value in enumerate(indices):
            df['x_' + str(index+1)] = original_df['x_' + str(value)]
            df['y_' + str(index+1)] = original_df['y_' + str(value)]
        if time_forced is not None:
            if df.loc[len(df)-1, 'x_' + str(time_forced)] != 1:
                current_conf = df.loc[len(df)-1, 'x_1':'x_'+str(n_times)]
                indices_notnoise = np.where(current_conf == 1)[0]
                np.random.shuffle(indices_notnoise)
                swap_index = indices_notnoise[0]  # just choose first
                tmp_x = df['x_' + str(swap_index+1)].copy()
                tmp_y = df['y_' + str(swap_index+1)].copy()
                df['x_' + str(swap_index+1)] = df['x_'+str(time_forced)]
                df['y_' + str(swap_index+1)] = df['y_'+str(time_forced)]
                df['x_' + str(time_forced)] = tmp_x
                df['y_' + str(time_forced)] = tmp_y

    def shuffle_time_range(self, df, n_times, time_range_forced, monotony_forced=False):
        """Permute time columns separately inside and outside [start, end] (inclusive)."""
        original_df = df.copy()
        ######### SHUFFLE INSIDE TIME RANGE #########
        indices = np.array(range(time_range_forced[0], time_range_forced[1] + 1))
        if monotony_forced:
            indices = indices[::-1]
        else:
            np.random.shuffle(indices)
        for index, value in enumerate(indices):
            df['x_' + str(time_range_forced[0]+index)] = original_df['x_' + str(value)]
            df['y_' + str(time_range_forced[0]+index)] = original_df['y_' + str(value)]
        ######### SHUFFLE OUTSIDE TIME RANGE #########
        all_indices = np.array(range(1, n_times + 1))
        indices = np.array(range(time_range_forced[0], time_range_forced[1] + 1))
        indices_ori = np.setdiff1d(all_indices, indices)  # destination positions
        indices = np.setdiff1d(all_indices, indices)      # source positions
        if monotony_forced:
            indices = indices[::-1]
        else:
            np.random.shuffle(indices)
        for index, value in enumerate(indices):
            df['x_' + str(indices_ori[index])] = original_df['x_' + str(value)]
            df['y_' + str(indices_ori[index])] = original_df['y_' + str(value)]

    def compute_r(self, data):
        """Recompute and store `pearson_r` for every not-noise location.

        Scope depends on the config: a single forced time (computed across
        locations), a forced time range, or the full time span. Returns the
        list of pearsonr results.
        """
        result = []
        if self.geotemp_config.time_forced is not None:
            # One coefficient across locations at the forced time.
            x = []
            y = []
            for index, row in data.iterrows():
                if row['notnoise_geo'] == 1:
                    name = 'x_' + str(self.geotemp_config.time_forced)
                    x.append(row[name])
                    name = 'y_' + str(self.geotemp_config.time_forced)
                    y.append(row[name])
            coef = pearsonr(x, y)
            for index, row in data.iterrows():
                if row['notnoise_geo'] == 1:
                    data.loc[index, 'pearson_r'] = coef[0]
                    result.append(coef)
        elif self.geotemp_config.time_range_forced is not None:
            # Per-location coefficient over the forced time range only.
            for index, row in data.iterrows():
                if row['notnoise_geo'] == 1:
                    x_start = 'x_' + str(self.geotemp_config.time_range_forced[0])
                    x_end = 'x_' + str(self.geotemp_config.time_range_forced[1])
                    x = data.loc[index, x_start:x_end]
                    y_start = 'y_' + str(self.geotemp_config.time_range_forced[0])
                    y_end = 'y_' + str(self.geotemp_config.time_range_forced[1])
                    y = data.loc[index, y_start:y_end]
                    coef = pearsonr(x, y)
                    data.loc[index, 'pearson_r'] = coef[0]
                    result.append(coef)
        else:
            # Per-location coefficient over the whole time span.
            for index, row in data.iterrows():
                if row['notnoise_geo'] == 1:
                    x_start = 'x_1'
                    x_end = 'x_' + str(self.geotemp_config.times_total)
                    x = data.loc[index, x_start:x_end]
                    y_start = 'y_1'
                    y_end = 'y_' + str(self.geotemp_config.times_total)
                    y = data.loc[index, y_start:y_end]
                    coef = pearsonr(x, y)
                    data.loc[index, 'pearson_r'] = coef[0]
                    result.append(coef)
        return result

    def get_min_max_r(self, df):
        """Return (min, max) of `pearson_r` over not-noise locations.

        Sentinels (2, -2) lie outside the valid [-1, 1] range and are returned
        unchanged when no not-noise location exists.
        """
        r_min = 2
        r_max = -2
        for index, row in df.iterrows():
            if row['notnoise_geo'] == 1:
                r_min = min(r_min, row['pearson_r'])
                r_max = max(r_max, row['pearson_r'])
        return (r_min, r_max)