import csv
import json
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from matplotlib import pyplot
from scipy.stats import pearsonr
from sklearn.datasets import make_regression
from sklearn.preprocessing import MinMaxScaler


class StatesDataConfig:
    """Configuration and builder for a per-state "noise / not noise" table.

    Indicator convention used throughout:
        1 -> not noise, i.e. it will follow the applied correlation.
        0 -> noise.

    The table built by :meth:`get_dataframe` has one row per location read
    from a CSV file, plus one trailing marker row (``state == 'notnoise_time'``)
    whose ``x_<t>`` / ``y_<t>`` cells flag which time steps are not noise.
    """

    def __init__(self):
        # Regarding space
        self.locations_total = 48          # number of location rows expected
        self.locations_noise_perc = 0      # fraction of locations left as noise
        self.location_forced = None        # single 'state' code forced to not-noise
        self.location_range_forced = None  # 'N' / 'S' / 'W' / 'E' column forced to not-noise

        # Regarding time
        self.times_total = 9               # number of time steps (x_1..x_n, y_1..y_n)
        self.times_noise_perc = 0          # fraction of time steps left as noise
        self.time_forced = None            # single time step forced to not-noise
        self.time_range_forced = None      # (start, end) time steps forced, inclusive

    ########## GEOGRAPHICAL TRANSFORMATIONS ##############

    def assign_location_ranges(self, df):
        """Fill the ``N``, ``S``, ``W`` and ``E`` indicator columns of *df* in place.

        Rows are ranked by ``y`` (north/south) and by ``x`` (west/east); the
        first 26 / 22 / 17 / 31 rows of each ranking get a 1 and the rest a 0.
        ``NV`` and ``DE`` are then overridden by hand. The frame is re-sorted
        by index before returning, so the caller sees the original row order.

        Args:
            df: DataFrame with ``state``, ``x`` and ``y`` columns and exactly
                ``self.locations_total`` rows.
        """
        ######### NORTH #########
        df.sort_values(by='y', ascending=False, inplace=True)
        # The indicator array is assigned positionally, i.e. in sorted order.
        df['N'] = self.get_location_range_indicators(26)
        # Exceptions:
        df.loc[self.get_location_index(df, 'NV'), 'N'] = 0
        df.loc[self.get_location_index(df, 'DE'), 'N'] = 1

        ######### SOUTH #########
        df.sort_values(by='y', ascending=True, inplace=True)
        df['S'] = self.get_location_range_indicators(22)
        # Exceptions:
        df.loc[self.get_location_index(df, 'NV'), 'S'] = 1
        df.loc[self.get_location_index(df, 'DE'), 'S'] = 0

        ######### WEST #########
        df.sort_values(by='x', ascending=True, inplace=True)
        df['W'] = self.get_location_range_indicators(17)

        ######### EAST #########
        df.sort_values(by='x', ascending=False, inplace=True)
        df['E'] = self.get_location_range_indicators(31)

        # Restore the original row order.
        df.sort_index(inplace=True)

    def get_location_index(self, df, location_code):
        """Return the index label of the first row whose ``state`` is *location_code*.

        Raises:
            IndexError: if no row of *df* carries that code.
        """
        matches = df.index[df['state'] == location_code]
        if len(matches) == 0:
            raise IndexError(
                "location code {!r} not found in the 'state' column".format(location_code)
            )
        return int(matches[0])

    def get_location_range_indicators(self, n):
        """Return an array of ``self.locations_total`` floats: *n* ones, then zeros."""
        n_zeros = self.locations_total - n
        return np.concatenate([np.ones(n), np.zeros(n_zeros)])

    @staticmethod
    def _promote_to_notnoise(flags, target_notnoise):
        """Flip random 0 entries of *flags* to 1 until *target_notnoise* ones exist.

        *flags* is modified in place and returned. Nothing happens when the
        array already holds at least *target_notnoise* ones. If fewer zero
        entries are available than requested, all of them are flipped.
        """
        missing = target_notnoise - len(np.where(flags == 1)[0])
        if missing > 0:
            noise_positions = np.where(flags == 0)[0]
            np.random.shuffle(noise_positions)
            # Slicing (rather than indexing one by one) cannot run past the end.
            flags[noise_positions[:missing]] = 1
        return flags

    def add_extra_geographical_notnoise(self, df):
        """Randomly raise ``df['notnoise_geo']`` entries to reach the configured quota.

        The quota of not-noise locations is ``locations_total`` minus
        ``round(locations_total * locations_noise_perc)``. Entries that are
        already 1 are kept; the column is rewritten in place.
        """
        locations_noise = round(self.locations_total * self.locations_noise_perc)
        locations_notnoise = self.locations_total - locations_noise

        current_notnoise = np.array(df['notnoise_geo'])
        self._promote_to_notnoise(current_notnoise, locations_notnoise)
        df['notnoise_geo'] = current_notnoise

    ############### TEMPORAL TRANSFORMATION #################

    def add_extra_time_notnoise(self, df):
        """Randomly raise time flags in the last row of *df* to reach the quota.

        The quota of not-noise time steps is ``times_total`` minus
        ``round(times_total * times_noise_perc)``. The flags are read from the
        ``x_1..x_<times_total>`` cells of the last row, and the result is
        written to both the ``x_*`` and the ``y_*`` cells of that row.
        """
        times_noise = round(self.times_total * self.times_noise_perc)
        times_notnoise = self.times_total - times_noise

        last_row = len(df) - 1
        x_end = 'x_' + str(self.times_total)
        y_end = 'y_' + str(self.times_total)

        current_notnoise = np.array(df.loc[last_row, 'x_1':x_end].copy())
        self._promote_to_notnoise(current_notnoise, times_notnoise)

        df.loc[last_row, 'x_1':x_end] = current_notnoise
        df.loc[last_row, 'y_1':y_end] = current_notnoise

    ############### GET DATAFRAME #################

    def get_dataframe(self, csv_path='us_states_codes_centers.csv'):
        """Build the table of locations, range indicators and not-noise flags.

        Args:
            csv_path: path of the CSV file to read. Its first row is the
                header; the first field of every data row is compared against
                the ignored codes ``'HI'`` and ``'AK'``. The file must provide
                ``x`` and ``y`` columns and, after the optional renaming of
                ``State`` -> ``name`` and ``Abbreviation`` -> ``state``,
                ``state`` and ``name`` columns.

        Returns:
            A DataFrame holding the CSV columns followed by ``N``, ``S``,
            ``W``, ``E``, ``x_1..x_n``, ``y_1..y_n``, ``pearson_r`` and
            ``notnoise_geo``, with one extra final row named
            ``'notnoise_time'`` carrying the per-time-step flags.
        """
        states_to_ignore = ['HI', 'AK']
        time_steps = range(1, self.times_total + 1)

        ######### READ DATA FROM STATES CSV #########
        with open(csv_path, 'r', newline='') as f:
            reader = csv.reader(f, delimiter=',')

            ######### CREATE COLUMNS #########
            columns = next(reader)
            columns += ['N', 'S', 'W', 'E']
            columns += ['x_' + str(t) for t in time_steps]
            columns += ['y_' + str(t) for t in time_steps]
            columns.append('pearson_r')

            ######### CREATE DUMMY ROWS #########
            # 4 range indicators + x_* + y_* + pearson_r, all initialised to 0.
            padding = [0] * (4 + 2 * self.times_total + 1)
            data = [row + padding for row in reader if row[0] not in states_to_ignore]

        ######### BUILD DATAFRAME #########
        df = pd.DataFrame(data, columns=columns)
        # csv.reader yields strings; the coordinates must be numeric for sorting.
        df[list('xy')] = df[list('xy')].astype(float)
        df = df.rename(columns={'State': 'name', 'Abbreviation': 'state'})

        ######### ASSIGN LOCATIONS PREDEFINED RANGES ############
        self.assign_location_ranges(df)

        ######### ASSIGN GEOGRAPHICAL NOT NOISE #########
        # add not noise column
        df['notnoise_geo'] = 0

        # Force location
        if self.location_forced is not None:
            forced_index = self.get_location_index(df, self.location_forced)
            df.loc[forced_index, 'notnoise_geo'] = 1

        # Force location_range (replaces the column, including a forced location)
        if self.location_range_forced is not None:
            df['notnoise_geo'] = df[self.location_range_forced]

        # add extra not noise
        self.add_extra_geographical_notnoise(df)

        ######### ASSIGN TEMPORAL NOT NOISE #########
        # add not noise row: a copy of row 0 with every cell from 'x' onwards zeroed
        dummy = df.loc[0].copy()
        dummy['state'] = 'notnoise_time'
        dummy['name'] = 'notnoise_time'
        dummy.loc['x':'notnoise_geo'] = 0
        df.loc[len(df)] = dummy
        last_row = len(df) - 1

        # Force time
        if self.time_forced is not None:
            for prefix in ('x_', 'y_'):
                df.loc[last_row, prefix + str(self.time_forced)] = 1

        # Force time_range (label slice, so both ends are included)
        if self.time_range_forced is not None:
            for prefix in ('x_', 'y_'):
                start = prefix + str(self.time_range_forced[0])
                end = prefix + str(self.time_range_forced[1])
                df.loc[last_row, start:end] = 1

        # add extra not noise
        self.add_extra_time_notnoise(df)

        return df