"""Generate geo-temporal synthetic datasets whose (x, y) series follow a target Pearson r.

Locations come from a StatesDataConfig dataframe; generated data is written back into
`x_i` / `y_i` columns and exported through DataExporter.
"""

import matplotlib.pyplot as plt
from matplotlib import pyplot
from sklearn.datasets import make_regression
from sklearn.preprocessing import MinMaxScaler
import sklearn
import numpy as np
from scipy.stats import pearsonr
import csv
import pandas as pd
import json
import random

from data_exporter import DataExporter
from data_states_config import StatesDataConfig


class CorrelatedDataGenerator:
    """Builds correlated (x, y) time series per location and applies geo/time noise rules."""

    def __init__(self, config):
        # Geo-temporal structure (locations, times, noise configuration)
        self.geotemp_config = config
        # Correlation targets for "not noise" locations
        self.r = 0
        self.monotonic = False
        self.type_monotonic = None  # 1 indicates increasing, -1 indicates decreasing
        # Correlation targets for the noise-exception locations
        self.r_exception = 0
        self.monotonic_exception = False
        self.type_monotonic_exception = None
        # Min and max intervals used to pick the random value range of each variable
        self.x_start_interval = [20, 30]
        self.x_end_interval = [75, 85]
        self.y_start_interval = [25, 35]
        self.y_end_interval = [40, 50]
        # Exporter output path
        self.path = 'data/'

    def increase_ranges(self):
        """Widen the random scaling intervals (used to make forced data stand out more)."""
        self.x_start_interval = [5, 50]
        self.x_end_interval = [51, 95]
        self.y_start_interval = [10, 40]
        self.y_end_interval = [41, 55]

    def normal_ranges(self):
        """Restore the default scaling intervals set in __init__."""
        self.x_start_interval = [20, 30]
        self.x_end_interval = [75, 85]
        self.y_start_interval = [25, 35]
        self.y_end_interval = [40, 50]

    def get_noise(self, r):
        """Return the regression noise level: low noise for strong |r|, high otherwise."""
        if abs(r) >= 0.8:
            return 10
        else:
            return 100

    def generate_correlated_data(self, size, r, monotonic=False, type_monotonic=None,
                                 scale_x=None, scale_y=None):
        """Generate `size` (x, y) pairs whose sample Pearson r matches the target `r`.

        Repeatedly draws a noisy regression and rescales it until the sample
        coefficient satisfies the acceptance rule for `r`.

        Args:
            size: number of samples (time steps) to generate.
            r: target Pearson coefficient (0.2, 0.8 and -0.8 have special rules).
            monotonic: if True, sort the pairs by x so the series is monotonic.
            type_monotonic: -1 reverses the series to make it decreasing.
            scale_x, scale_y: optional [min, max] ranges; random ranges are drawn
                from the configured intervals when None.

        Returns:
            (x, y, coef) where coef is the scipy `pearsonr` result.
        """
        noise = self.get_noise(r)
        x = np.zeros(size)
        y = np.zeros(size)
        ######### GENERATE DATA UNTIL DESIRED PEARSON R ##########
        while True:
            ######### GENERATE REGRESSION #########
            x_tmp, y_tmp = make_regression(n_samples=size, n_features=1,
                                           n_informative=1, noise=noise)
            x_tmp = (x_tmp.T)[0]
            ######### SCALE #########
            # NOTE(review): once chosen, the random ranges stay fixed for all
            # retries of this call (scale_x/scale_y stop being None) — presumably
            # intentional so retries stay in the same value range.
            if scale_x is None:
                x_start = np.random.randint(self.x_start_interval[0], self.x_start_interval[1])
                x_end = np.random.randint(self.x_end_interval[0], self.x_end_interval[1])
                scale_x = [x_start, x_end]
            if scale_y is None:
                y_start = np.random.randint(self.y_start_interval[0], self.y_start_interval[1])
                y_end = np.random.randint(self.y_end_interval[0], self.y_end_interval[1])
                scale_y = [y_start, y_end]
            x_scaled = sklearn.preprocessing.minmax_scale(x_tmp, feature_range=(scale_x[0], scale_x[1]))
            y_scaled = sklearn.preprocessing.minmax_scale(y_tmp, feature_range=(scale_y[0], scale_y[1]))
            for i in range(0, size):
                x[i] = x_scaled[i]
                y[i] = y_scaled[i]
            ######### MAKE PERFECT CORRELATION #########
            if abs(r) == 0.8:
                # Sorting both series yields a near-perfect monotone relation;
                # reversing y flips the sign for the negative target.
                x.sort()
                y.sort()
                if r == -0.8:
                    y = y[::-1]
            else:
                if monotonic:
                    # Reorder pairs by ascending x, keeping (x, y) pairs together.
                    indices = x_scaled.argsort()
                    for i, index in enumerate(indices):
                        x[i] = x_scaled[index]
                        y[i] = y_scaled[index]
                if not monotonic:
                    # Random pairing order (still the same set of pairs).
                    indices = list(range(0, size))
                    np.random.shuffle(indices)
                    x_tmp = x.copy()
                    y_tmp = y.copy()
                    for i in range(0, size):
                        x[i] = x_tmp[indices[i]]
                        y[i] = y_tmp[indices[i]]
                if type_monotonic == -1:
                    # Reverse both series to turn an increasing series into a
                    # decreasing one (correlation sign is unaffected).
                    x = x[::-1]
                    y = y[::-1]
            ######### BREAK IF DESIRED COEF #########
            coef = pearsonr(x, y)
            if r == 0.2 and abs(coef[0]) <= 0.2:
                break
            if r == 0.8 and coef[0] >= 0.95:
                break
            if r == -0.8 and coef[0] <= -0.95:
                break
            # Fallback for any other target (including r == 0): accept once the
            # sample coefficient is close to the target. Without this, targets
            # outside {0.2, 0.8, -0.8} looped forever (old TODO asked for this).
            if r not in (0.2, 0.8, -0.8) and abs(coef[0] - r) <= 0.1:
                break
        ######### RETURN #########
        return (x, y, coef)

    def generate_geotemp_data(self):
        """Fill every location row with correlated series, apply time exceptions and export.

        Returns the populated dataframe.
        """
        # Type monotonic can be increasing (1) or decreasing (-1)
        ######### GET TEMP INFO FROM CONFIG #########
        times_total = self.geotemp_config.times_total
        ######### GET DATAFRAME OF LOCATIONS #########
        df = self.geotemp_config.get_dataframe()
        ######### ITERATE THROUGH LOCATIONS #########
        for index, row in df.iterrows():
            # The 'notnoise_time' row is a configuration row, not a location.
            if row['state'] == 'notnoise_time':
                continue
            ######### GENERATE AND ADD DATA #########
            is_notnoise = df.loc[index, 'notnoise_geo']
            if is_notnoise == 1:
                x, y, coef = self.generate_correlated_data(
                    times_total, self.r, self.monotonic, self.type_monotonic)
            else:
                x, y, coef = self.generate_correlated_data(
                    times_total, self.r_exception, self.monotonic_exception,
                    self.type_monotonic_exception)
            for i in range(1, times_total + 1):
                col_name = 'x_' + str(i)
                df.loc[index, col_name] = x[i-1]
                col_name = 'y_' + str(i)
                df.loc[index, col_name] = y[i-1]
            df.loc[index, 'pearson_r'] = coef[0]
        ######### EXECUTE TIME EXCEPTIONS #########
        self.add_time_noise(df)
        self.force_one_time(df)
        self.force_time_range(df)
        ######### RE COMPUTE R #########
        self.compute_r(df)
        ######### EXPORT DATA TO FILES #########
        exporter = DataExporter(self.path)
        exporter.export(df, self.r, self.monotonic, self.type_monotonic,
                        self.r_exception, self.monotonic_exception,
                        self.type_monotonic_exception, self.geotemp_config)
        return df

    def add_time_noise(self, df):
        """Overwrite noise-flagged time steps of not-noise locations with exception data.

        The last dataframe row holds the time configuration; a 0 in its `x_i`
        marks time step i as a noise time to be replaced — TODO confirm against
        StatesDataConfig.
        """
        times_total = self.geotemp_config.times_total
        for index, row in df.iterrows():
            if row['state'] == 'notnoise_time':
                continue
            is_notnoise = df.loc[index, 'notnoise_geo']
            if is_notnoise == 1:
                # get scale_x: reuse the row's current value range so the
                # replacement data blends into the same scale
                x_end = 'x_' + str(times_total)
                x_current = np.array(row.loc['x_1':x_end])
                x_min = np.min(x_current)
                x_max = np.max(x_current)
                # get scale_y
                y_end = 'y_' + str(times_total)
                y_current = np.array(row.loc['y_1':y_end])
                y_min = np.min(y_current)
                y_max = np.max(y_current)
                # Get new data with the exception correlation settings
                x, y, coef = self.generate_correlated_data(
                    times_total, self.r_exception, self.monotonic_exception,
                    self.type_monotonic_exception, [x_min, x_max], [y_min, y_max])
                # Assign replacement values at every time flagged 0 in the config row
                notnoise = np.array(df.loc[len(df)-1, 'x_1':x_end].copy())
                for notnoise_index, notnoise_value in enumerate(notnoise):
                    if notnoise_value == 0:
                        col_name = 'x_' + str(notnoise_index + 1)
                        df.loc[index, col_name] = x[notnoise_index]
                        col_name = 'y_' + str(notnoise_index + 1)
                        df.loc[index, col_name] = y[notnoise_index]

    def force_one_time(self, df):
        """For each configured time, regenerate data ACROSS not-noise locations.

        No-op (returns 0) when no time is forced in the config.
        """
        if self.geotemp_config.time_forced is None:
            return 0
        locations_noise = round(self.geotemp_config.locations_total
                                * self.geotemp_config.locations_noise_perc)
        locations_notnoise = self.geotemp_config.locations_total - locations_noise
        n_times = self.geotemp_config.times_total
        # Config row: times flagged 1 get a cross-location correlated sample.
        current_conf = df.loc[len(df)-1, 'x_1':'x_'+str(n_times)]
        indices_notnoise = np.where(current_conf == 1)[0]
        #self.increase_ranges()
        for notnoise_index in indices_notnoise:
            time = notnoise_index + 1
            # One sample per not-noise location, at this single time step.
            x, y, coef = self.generate_correlated_data(locations_notnoise, self.r, False, None)
            state_index = 0
            for index, row in df.iterrows():
                if row['state'] == 'notnoise_time':
                    continue
                if row['notnoise_geo'] == 1:
                    x_name = 'x_' + str(time)
                    y_name = 'y_' + str(time)
                    df.loc[index, x_name] = x[state_index]
                    df.loc[index, y_name] = y[state_index]
                    state_index += 1
        #self.normal_ranges()

    def force_time_range(self, df):
        """Regenerate the configured time range WITHIN each not-noise location.

        No-op (returns 0) when no time range is forced in the config.
        """
        if self.geotemp_config.time_range_forced is None:
            return 0
        locations_noise = round(self.geotemp_config.locations_total
                                * self.geotemp_config.locations_noise_perc)
        locations_notnoise = self.geotemp_config.locations_total - locations_noise
        start = self.geotemp_config.time_range_forced[0]
        end = self.geotemp_config.time_range_forced[1]
        n_times = end - start + 1
        for index, row in df.iterrows():
            if row['state'] == 'notnoise_time':
                continue
            if row['notnoise_geo'] == 1:
                x, y, coef = self.generate_correlated_data(
                    n_times, self.r, self.monotonic, self.type_monotonic)
                for i in range(0, n_times):
                    x_name = 'x_' + str(start + i)
                    y_name = 'y_' + str(start + i)
                    df.loc[index, x_name] = x[i]
                    df.loc[index, y_name] = y[i]
        #self.normal_ranges()

    def shuffle_all_locations(self, df, location_forced=None):
        """Randomly permute all location rows' data columns, keeping the config row fixed.

        If `location_forced` names a state, ensure that state ends up with
        not-noise data by swapping with a random not-noise row when needed.
        """
        orginal_df = df.copy()
        indices = np.array(df.index.tolist())
        indices = indices[:-1]  # exclude the trailing 'notnoise_time' config row
        np.random.shuffle(indices)
        for index, row in df.iterrows():
            if row['state'] != 'notnoise_time':
                df.loc[index, 'x_1':'notnoise_geo'] = \
                    orginal_df.loc[indices[index], 'x_1':'notnoise_geo']
        if location_forced is not None:
            state_index = df[df['state'] == location_forced].index.values.astype(int)[0]
            if df.loc[state_index, 'notnoise_geo'] != 1:
                indices_notnoise = np.array(df[df['notnoise_geo'] == 1].index.tolist())
                # BUGFIX: randint's upper bound is exclusive. The old
                # `randint(0, len - 1)` could never pick the last not-noise row
                # and raised ValueError when only one not-noise row existed.
                swap_index = np.random.randint(0, len(indices_notnoise))
                swap_index = indices_notnoise[swap_index]
                # Explicit copy so the temp survives the overwrite below.
                tmp = df.loc[swap_index, 'x_1':'notnoise_geo'].copy()
                df.loc[swap_index, 'x_1':'notnoise_geo'] = \
                    df.loc[state_index, 'x_1':'notnoise_geo']
                df.loc[state_index, 'x_1':'notnoise_geo'] = tmp

    def shuffle_location_range(self, df, location_range_forced):
        """Permute location rows separately inside and outside a location range.

        `location_range_forced` is a column name whose value 1/0 marks rows as
        inside/outside the range; rows are only shuffled within their own group.
        """
        orginal_df = df.copy()
        ######### SHUFFLE INSIDE LOCATION RANGE #########
        indices = np.array(df[(df[location_range_forced] == 1)
                              & (df['state'] != 'notnoise_time')].index.tolist())
        np.random.shuffle(indices)
        n = 0
        for index, row in df.iterrows():
            if row['state'] != 'notnoise_time' and row[location_range_forced] == 1:
                df.loc[index, 'x_1':'notnoise_geo'] = \
                    orginal_df.loc[indices[n], 'x_1':'notnoise_geo']
                n += 1
        ######### SHUFFLE OUTSIDE LOCATION RANGE #########
        indices = np.array(df[(df[location_range_forced] == 0)
                              & (df['state'] != 'notnoise_time')].index.tolist())
        np.random.shuffle(indices)
        n = 0
        for index, row in df.iterrows():
            if row['state'] != 'notnoise_time' and row[location_range_forced] == 0:
                df.loc[index, 'x_1':'notnoise_geo'] = \
                    orginal_df.loc[indices[n], 'x_1':'notnoise_geo']
                n += 1

    def shuffle_times(self, df, n_times, time_forced=None, monotony_forced=False):
        """Permute the time columns (x_i/y_i pairs move together).

        `monotony_forced` reverses the time order instead of shuffling.
        If `time_forced` is given and does not land on a not-noise time (config
        row value 1), swap it with one that does.
        """
        original_df = df.copy()
        indices = np.array(range(1, n_times + 1))
        if monotony_forced:
            indices = indices[::-1]
        else:
            np.random.shuffle(indices)
        for index, value in enumerate(indices):
            df['x_' + str(index+1)] = original_df['x_' + str(value)]
            df['y_' + str(index+1)] = original_df['y_' + str(value)]
        if time_forced is not None:
            if df.loc[len(df)-1, 'x_' + str(time_forced)] != 1:
                current_conf = df.loc[len(df)-1, 'x_1':'x_'+str(n_times)]
                indices_notnoise = np.where(current_conf == 1)[0]
                np.random.shuffle(indices_notnoise)
                swap_index = indices_notnoise[0]  # just choose first
                tmp_x = df['x_' + str(swap_index+1)].copy()
                tmp_y = df['y_' + str(swap_index+1)].copy()
                df['x_' + str(swap_index+1)] = df['x_'+str(time_forced)]
                df['y_' + str(swap_index+1)] = df['y_'+str(time_forced)]
                df['x_' + str(time_forced)] = tmp_x
                df['y_' + str(time_forced)] = tmp_y

    def shuffle_time_range(self, df, n_times, time_range_forced, monotony_forced=False):
        """Permute time columns separately inside and outside [start, end] (inclusive)."""
        original_df = df.copy()
        ######### SHUFFLE INSIDE TIME RANGE #########
        indices = np.array(range(time_range_forced[0], time_range_forced[1] + 1))
        if monotony_forced:
            indices = indices[::-1]
        else:
            np.random.shuffle(indices)
        for index, value in enumerate(indices):
            df['x_' + str(time_range_forced[0]+index)] = original_df['x_' + str(value)]
            df['y_' + str(time_range_forced[0]+index)] = original_df['y_' + str(value)]
        ######### SHUFFLE OUTSIDE TIME RANGE #########
        all_indices = np.array(range(1, n_times + 1))
        indices = np.array(range(time_range_forced[0], time_range_forced[1] + 1))
        indices_ori = np.setdiff1d(all_indices, indices)  # destination positions
        indices = np.setdiff1d(all_indices, indices)      # source positions
        if monotony_forced:
            indices = indices[::-1]
        else:
            np.random.shuffle(indices)
        for index, value in enumerate(indices):
            df['x_' + str(indices_ori[index])] = original_df['x_' + str(value)]
            df['y_' + str(indices_ori[index])] = original_df['y_' + str(value)]

    def compute_r(self, data):
        """Recompute and store `pearson_r` for every not-noise location.

        Scope depends on the config: a single forced time (computed across
        locations), a forced time range, or the full time span. Returns the
        list of pearsonr results.
        """
        result = []
        if self.geotemp_config.time_forced is not None:
            # One coefficient across locations at the forced time.
            x = []
            y = []
            for index, row in data.iterrows():
                if row['notnoise_geo'] == 1:
                    name = 'x_' + str(self.geotemp_config.time_forced)
                    x.append(row[name])
                    name = 'y_' + str(self.geotemp_config.time_forced)
                    y.append(row[name])
            coef = pearsonr(x, y)
            for index, row in data.iterrows():
                if row['notnoise_geo'] == 1:
                    data.loc[index, 'pearson_r'] = coef[0]
                    result.append(coef)
        elif self.geotemp_config.time_range_forced is not None:
            # Per-location coefficient over the forced time range only.
            for index, row in data.iterrows():
                if row['notnoise_geo'] == 1:
                    x_start = 'x_' + str(self.geotemp_config.time_range_forced[0])
                    x_end = 'x_' + str(self.geotemp_config.time_range_forced[1])
                    x = data.loc[index, x_start:x_end]
                    y_start = 'y_' + str(self.geotemp_config.time_range_forced[0])
                    y_end = 'y_' + str(self.geotemp_config.time_range_forced[1])
                    y = data.loc[index, y_start:y_end]
                    coef = pearsonr(x, y)
                    data.loc[index, 'pearson_r'] = coef[0]
                    result.append(coef)
        else:
            # Per-location coefficient over the whole time span.
            for index, row in data.iterrows():
                if row['notnoise_geo'] == 1:
                    x_start = 'x_1'
                    x_end = 'x_' + str(self.geotemp_config.times_total)
                    x = data.loc[index, x_start:x_end]
                    y_start = 'y_1'
                    y_end = 'y_' + str(self.geotemp_config.times_total)
                    y = data.loc[index, y_start:y_end]
                    coef = pearsonr(x, y)
                    data.loc[index, 'pearson_r'] = coef[0]
                    result.append(coef)
        return result

    def get_min_max_r(self, df):
        """Return (min, max) of `pearson_r` over not-noise locations.

        Sentinels (2, -2) lie outside the valid [-1, 1] range and are returned
        unchanged when no not-noise location exists.
        """
        r_min = 2
        r_max = -2
        for index, row in df.iterrows():
            if row['notnoise_geo'] == 1:
                r_min = min(r_min, row['pearson_r'])
                r_max = max(r_max, row['pearson_r'])
        return (r_min, r_max)