# ============================
# SCRIPT DESCRIPTION
# ============================
# This script processes and analyzes sequential data using a Bidirectional LSTM model.
# It trains the model under a grid of configurations and saves the training and
# prediction results for evaluation.
# See 00_Readme.pdf, provided with the code, for details.

# Cite as:
# Cerek K., Gupta A., Dao D. A., Hadjiloo E. and Grabe J. 2024.
# Python Implementation of Bidirectional LSTM for Sequential Data Processing
# Source Code, DOI: 10.15480/882.13190.

# It was applied in parameter studies on CRS element tests; the corresponding dataset is available at:
# Cerek K., Dao D. A., Hadjiloo E. and Grabe J. 2024.
# Dataset of Simulated CRS Tests for Advanced Soil Parameter Identification.
# Databank, DOI: 10.15480/882.9435.

# Authors:
# Kacper Cerek
# Arjun Gupta
# Date: August 01, 2024
# ============================

# ============================
# IMPORT STATEMENTS
# ============================

import pandas as pd
import os
import numpy as np
import shutil
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Bidirectional
from sklearn.metrics import mean_squared_error
import math
from tensorflow.keras.callbacks import EarlyStopping
import time
import statistics
from tensorflow.keras.backend import clear_session

# ============================
# CONFIGURATION PARAMETERS
# ============================

# Set random seed for reproducibility
seed = 1993
tf.random.set_seed(seed)  # Set seed for TensorFlow operations
np.random.seed(seed)  # Set seed for NumPy operations

# Configuration parameters for the experiment
data_known_percentage = [0.6, 0.7, 0.8]  # Fractions of the data treated as known and used for training
point_skip = [1, 2, 3]  # Thinning intervals: keep every n-th point of the dataset
n_steps = [10, 20, 50]  # Number of time steps to consider in each input sequence
n_batches = [5, 10, 20, 50]  # Batch sizes to be used during training
column_name = "sigma1eff[kN/m²]"  # Column name to be extracted from the dataset
folder_path = os.getcwd()  # Get the current working directory
n_features = 1  # Number of features in the dataset (univariate data)
patience = 500  # Patience for early stopping during training
n_epochs = 5000  # Maximum number of epochs for training
central_save_loc = os.path.join(folder_path, 'Results')  # Directory to save the results
test_group_path = os.path.join(folder_path, 'Test_Group')  # Directory containing the test data files
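
# Note: the four parameter lists above are crossed in a full grid (see process_file below),
# so with the values set here each input file is trained under
# len(data_known_percentage) * len(point_skip) * len(n_steps) * len(n_batches) = 3 * 3 * 3 * 4 = 108 configurations.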

# ============================
# FUNCTION DEFINITIONS
# ============================

def split_sequence(sequence, n_step):
    """
    Splits a univariate sequence into samples of input-output pairs.

    :param sequence: The sequence of data to be split.
    :param n_step: The number of time steps in each input sample.
    :return: Two lists: the input windows X and the corresponding next values y.
    """
    X, y = [], []  # Initialize empty lists to store input-output pairs
    for j in range(len(sequence)):
        end_ix = j + n_step  # End index of the current input window
        if end_ix > len(sequence) - 1:  # Stop once the window would run past the last available target
            break
        seq_x, seq_y = sequence[j:end_ix], sequence[end_ix]  # Slice out the input window and its target value
        X.append(seq_x)  # Add the input window to X
        y.append(seq_y)  # Add the corresponding target value to y
    return X, y  # Return the lists of input-output pairs
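
# Illustrative example (not executed): split_sequence([10, 20, 30, 40, 50], 2)
# returns X = [[10, 20], [20, 30], [30, 40]] and y = [30, 40, 50], i.e. each
# window of n_step consecutive values is paired with the value that follows it.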


def save_loss_rmse(hist, second_layer_dir):
    """
    Saves the training loss and RMSE values for each epoch to a file.

    :param hist: The history object containing loss and RMSE for each epoch.
    :param second_layer_dir: The directory where the file will be saved.
    """
    # Create a DataFrame with epoch, loss, and RMSE values
    df_loss_rmse = pd.DataFrame({
        'Epoch': range(1, len(hist.history['loss']) + 1),  # Epoch numbers
        'Loss': hist.history['loss'],  # Loss values
        'RMSE': hist.history['root_mean_squared_error']  # RMSE values
    })
    # Save the DataFrame to a text file in the specified directory
    df_loss_rmse.to_csv(os.path.join(second_layer_dir, 'loss_rmse_epochs.txt'), index=False, sep='\t')
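
# The resulting 'loss_rmse_epochs.txt' is tab-separated with one row per epoch,
# e.g. (illustrative values):
#   Epoch   Loss     RMSE
#   1       12.304   3.508
#   2       9.861    3.140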


def calculate_rmse(actual, predicted):
    """
    Calculates the Root Mean Squared Error (RMSE) between the actual and predicted values.

    :param actual: The actual target values.
    :param predicted: The predicted values from the model.
    :return: The RMSE value.
    """
    mse = mean_squared_error(actual, predicted)  # Compute mean squared error
    return math.sqrt(mse)  # Return the square root of the mean squared error
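
# Worked example: actual = [1, 2, 3], predicted = [1, 2, 5]
# -> MSE = (0² + 0² + 2²) / 3 = 4/3 ≈ 1.333 and RMSE = sqrt(4/3) ≈ 1.155.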


def error_calc(pred, test):
    """
    Calculates the percentage error between predicted and test data, and computes the mean error.

    :param pred: The predicted values.
    :param test: The actual test values.
    :return: A tuple of percentage errors and the mean error percentage.
    """
    error_abs = np.abs(np.subtract(pred, test))  # Compute absolute error between predictions and actual values
    error = [(error_abs[x] / np.abs(test[x])) * 100 for x in range(len(error_abs))]  # Calculate percentage error
    error_mean = statistics.mean(error)  # Compute the mean of the percentage errors
    print(f'The error mean is {error_mean}')  # Print the mean error percentage
    return error, error_mean  # Return the list of errors and the mean error
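
# Worked example: pred = [105, 95], test = [100, 100]
# -> per-point errors = [5.0 %, 5.0 %] and error_mean = 5.0 %.
# Note: a test value of exactly 0 would cause a division by zero; the stress
# sequences processed here are assumed to stay non-zero throughout.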


def check_for_nan(array, array_name):
    """
    Checks if the array contains any NaN values and reports their positions.

    :param array: The array to be checked.
    :param array_name: The name of the array (for reporting purposes).
    :return: A boolean indicating if the entire array is NaN.
    """
    if np.isnan(array).any():  # Check if any value in the array is NaN
        # Report the positions of the NaN values
        print(f"NaN values found in {array_name} at indices: {np.where(np.isnan(array))}")
    else:
        print(f"No NaN values found in {array_name}")  # Report that no NaN values were found
    return np.isnan(array).all()  # Return True if all values in the array are NaN, otherwise False


def process_file(file):
    """
    Processes each file in the test group directory by applying different configurations and saving results.

    :param file: The file name to be processed.
    """
    data = pd.read_csv(os.path.join(test_group_path, file), sep='\t', header=0)  # Load the data from the file
    raw_seq = data[column_name].values.tolist()  # Extract the relevant column as a list
    file_base_name = os.path.splitext(file)[0]  # Get the base name of the file (without extension)
    first_layer_dir = os.path.join(central_save_loc, file_base_name)  # Create a directory for storing results

    if os.path.exists(first_layer_dir):  # Check if the directory already exists
        shutil.rmtree(first_layer_dir)  # Remove the existing directory if it exists
    os.makedirs(first_layer_dir)  # Create a new directory

    # Iterate over all combinations of known data percentages, point skips, steps, and batch sizes
    for kp in data_known_percentage:
        for ps in point_skip:
            filtered_raw_seq = raw_seq[::ps]  # Thin the sequence by keeping every ps-th point
            for step in n_steps:
                for batch in n_batches:
                    # Process each configuration and save the results
                    process_configuration(first_layer_dir, filtered_raw_seq, kp, ps, step, batch)
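
# Illustrative layout of the results written for one input file, e.g. a hypothetical
# 'test01.txt' (folder names follow the pattern built in process_configuration below):
#   Results/
#       test01/
#           nsteps10_skips1_datapercent0.6_nbatch5/
#               model_summary.txt, loss_rmse_epochs.txt, actual_data.txt,
#               predicted_train_data.txt, predicted_test_data.txt,
#               error_percent.txt, log_file.txt
#           nsteps10_skips1_datapercent0.6_nbatch10/
#           ...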


def process_configuration(first_layer_dir, filtered_raw_seq, kp, ps, step, batch):
    """
    Configures and trains the LSTM model on the given data, then saves the results.

    :param first_layer_dir: Directory where results will be saved.
    :param filtered_raw_seq: The filtered sequence of data points.
    :param kp: The percentage of data to be used for training.
    :param ps: The point skip interval.
    :param step: The number of time steps in each input sequence.
    :param batch: The batch size for training.
    """
    train_size = int(len(filtered_raw_seq) * kp)  # Calculate the size of the training set
    train_seq = filtered_raw_seq[:train_size]  # Training portion of the data
    test_seq = filtered_raw_seq[train_size:]  # Test portion of the data
    X_train, y_train = split_sequence(train_seq, step)  # Split the training sequence into input-output pairs
    X_train = np.array(X_train)  # Convert the training inputs to a NumPy array
    X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], n_features))  # Add the feature axis: the LSTM expects 3D input of shape (samples, step, n_features)
    y_train = np.array(y_train)  # Convert the training outputs to a NumPy array

    # Create a directory name based on the configuration parameters
    result_folder_name = f'nsteps{step}_skips{ps}_datapercent{kp}_nbatch{batch}'
    second_layer_dir = os.path.join(first_layer_dir, result_folder_name)  # Define the directory to store results

    if os.path.exists(second_layer_dir):  # Check if the directory already exists
        shutil.rmtree(second_layer_dir)  # Remove the existing directory if it exists
    os.makedirs(second_layer_dir)  # Create a new directory

    # Build the LSTM model with three Bidirectional LSTM layers
    model = Sequential([

        # First Bidirectional LSTM layer
        Bidirectional(
            LSTM(units=25,  # Number of units (neurons) in the LSTM layer.
                 activation='relu',  # Activation function used for the LSTM units, in this case, ReLU.
                 return_sequences=True),  # Return the full output sequence so it can feed the next LSTM layer.
            input_shape=(step, n_features)  # Shape of the input data: 'step' time steps,
            # with 'n_features' features per time step.
        ),

        # Second Bidirectional LSTM layer
        Bidirectional(
            LSTM(units=25,  # Again, 25 units (neurons) in this LSTM layer.
                 activation='relu',  # ReLU activation function.
                 return_sequences=True)  # Return the full sequence to the next LSTM layer.
        ),

        # Third Bidirectional LSTM layer
        Bidirectional(
            LSTM(units=25,  # 25 units in this LSTM layer as well.
                 activation='relu',  # ReLU activation function.
                 return_sequences=False)  # Return only the last output in the output sequence (not the full sequence).
            # This is typically used when the next layer is not recurrent.
        ),

        # Output Dense layer
        Dense(1,  # Single neuron in the output layer, as this is a regression problem
              # predicting a single continuous value.
              activation='linear')  # Linear activation function, appropriate for regression tasks.
    ])
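
    # Shape trace through the network (batch dimension omitted), a sketch based on
    # Keras' default 'concat' merge mode, which doubles the Bidirectional output width:
    #   input                                        -> (step, n_features) = (step, 1)
    #   BiLSTM 1 (25 units, return_sequences=True)   -> (step, 50)
    #   BiLSTM 2 (25 units, return_sequences=True)   -> (step, 50)
    #   BiLSTM 3 (25 units, return_sequences=False)  -> (50,)
    #   Dense(1, linear)                             -> (1,)  # the predicted next value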

    # Compile the model with Adam optimizer, mean squared error loss, and RMSE as a metric
    model.compile(
        optimizer='adam',  # Optimizer algorithm used for training the model.
        # 'adam' (Adaptive Moment Estimation) is an advanced gradient-based optimizer
        # that combines the advantages of two other optimizers: AdaGrad and RMSProp.

        loss='mse',  # Loss function used to evaluate how well the model performs.
        # 'mse' (Mean Squared Error) is commonly used for regression tasks. It measures the average squared difference
        # between the predicted values and the actual values.

        metrics=['root_mean_squared_error']  # List of metrics to be evaluated by the model during training and testing.
        # 'root_mean_squared_error' (RMSE) is the square root of the Mean Squared Error (MSE).
        # It provides a measure of the average magnitude of the error in the same units as the target variable.
    )

    # Configure early stopping to stop training if loss does not improve after 'patience' epochs
    early_stopping = EarlyStopping(
        monitor='loss',  # Monitor the 'loss' metric during training to decide when to stop.

        min_delta=1,  # Minimum change in the monitored metric (loss) to qualify as an improvement.
        # If the change in loss is less than this value, it will be considered as no improvement.

        patience=patience,  # Number of epochs with no improvement after which training will be stopped.
        # In this case, 'patience' is set to a predefined value, allowing the model to continue training
        # for a specified number of epochs without improvement before stopping.

        verbose=1,  # Controls the verbosity of the output.
        # If set to 1, a message will be printed when early stopping is triggered.

        mode='min',  # Stop when the monitored metric has stopped decreasing ('min') or increasing ('max');
        # with 'auto', the direction is inferred from the name of the monitored metric.
        # Since we are monitoring 'loss', which should be minimized, 'min' is used.

        restore_best_weights=True  # If set to True, the model will restore the weights from the epoch that had
        # the best performance (lowest loss) when early stopping is triggered.
    )
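    # In effect, with the values above: training halts once 'patience' (500) consecutive
    # epochs pass without the loss improving by at least min_delta (1.0), and the weights
    # are rolled back to the best epoch observed up to that point.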

    # Save the model summary to a text file
    with open(os.path.join(second_layer_dir, 'model_summary.txt'), 'w', encoding='utf-8') as f:
        model.summary(print_fn=lambda x: f.write(x + '\n'))

    start_time = time.time()  # Record the start time of training
    # Train the model on the training data
    history = model.fit(
        X_train,  # Training data features; array of shape (num_samples, step, n_features)

        y_train,  # Training data labels; array of shape (num_samples,)

        epochs=n_epochs,
        # Number of epochs to train the model. An epoch is one complete pass through the entire training dataset.

        verbose=2,  # Verbosity mode:
        # 0 = silent,
        # 1 = progress bar,
        # 2 = one line per epoch.
        # Set to 2 for more detailed output per epoch.

        batch_size=batch,
        # Number of samples per gradient update. The model processes 'batch' number of samples before updating weights.

        callbacks=[early_stopping]
        # List of callback functions to apply during training. In this case, 'early_stopping' is used to stop training early
        # if the monitored metric (loss) does not improve.
    )
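    # For example: with 600 training windows and batch = 20, each epoch performs
    # ceil(600 / 20) = 30 gradient updates, so at most n_epochs * 30 updates overall.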

    end_time = time.time()  # Record the end time of training

    # Predict the entire test sequence using the trained model
    predictions = []
    x_input = train_seq[-step:]  # Take the last 'step' values from the training sequence as the initial input
    x_input = np.array(x_input).reshape((1, step, n_features))  # Reshape input for the model

    # Loop through each value in the test sequence to generate predictions
    # This loop performs iterative predictions and updates the input for the next prediction.
    for val in range(len(test_seq)):
        yhat = model.predict(x_input, verbose=0)  # Predict the next value using the current input
        predictions.append(yhat[0][0])  # Store the predicted value in the predictions list
        x_input = np.roll(x_input, -1)  # Shift the input array by one position to the left
        x_input[0, -1, 0] = yhat[0][0]  # Update the last position of the input array with the newly predicted value
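    # Rolling-window sketch: if the current window is [a, b, c] and the model predicts p,
    # np.roll shifts it to [b, c, a] and the assignment overwrites the stale last slot,
    # giving [b, c, p] for the next step. Prediction errors therefore feed back into
    # later inputs (recursive multi-step forecasting).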

    # Predict the training sequence using the trained model
    predicted_train = []
    x_input_train = train_seq[:step]  # Take the first 'step' values from the training sequence as the initial input
    x_input_train = np.array(x_input_train).reshape((1, step, n_features))  # Reshape input for the model

    # Loop through each position in the training sequence to generate predictions
    # This loop performs iterative predictions on the training sequence and updates the input for the next prediction.
    for m in range(len(train_seq) - step):
        yhat_train = model.predict(x_input_train, verbose=0)  # Predict the next value using the current training input
        predicted_train.append(yhat_train[0][0])  # Store the predicted value in the predicted_train list
        x_input_train = np.roll(x_input_train, -1)  # Shift the input array by one position to the left
        x_input_train[0, -1, 0] = yhat_train[0][0]  # Update the last position of the input array with the newly predicted value

    # Check for NaN values in the predicted and actual training sequences
    allNan_train = check_for_nan(np.array(train_seq[step:]), 'train_seq[step:]')
    allNan_predicted = check_for_nan(np.array(predicted_train), 'predicted_train')

    # Convert the actual training sequence after the initial 'step' to a NumPy array for processing
    train_seq_clean = np.array(train_seq[step:])
    # Remove any NaN (Not a Number) values from the actual training sequence
    # ~np.isnan(train_seq_clean) creates a boolean array that is True for non-NaN values and False for NaN values.
    # This boolean array is used to filter out the NaN values from the actual training sequence.
    train_seq_clean = train_seq_clean[~np.isnan(train_seq_clean)]
    # Convert the list of predicted training values to a NumPy array for processing
    predicted_train_clean = np.array(predicted_train)
    # Remove any NaN (Not a Number) values from the predicted training values
    # ~np.isnan(predicted_train_clean) creates a boolean array that is True for non-NaN values and False for NaN values.
    # This boolean array is used to filter out the NaN values from the predicted training values.
    predicted_train_clean = predicted_train_clean[~np.isnan(predicted_train_clean)]

    # If there are non-NaN values in either the training data or the predictions, proceed with processing the results.
    # This includes:
    # 1. Ensuring that the lengths of the actual and predicted training sequences match for accurate error calculation.
    # 2. Saving the actual data, predicted training data, and predicted test data to separate text files.
    # 3. Calculating the percentage error between the predictions and actual test values and saving this error data to a file.
    # 4. Calculating and saving important metrics and details such as runtime, number of epochs run, final loss, final RMSE,
    #    and lengths of the data sequences to a log file.
    # 5. Saving the loss and RMSE values over epochs to a separate file.
    # If all values are NaN, log this issue and note that no calculations were possible.
    if not allNan_train or not allNan_predicted:
        # Ensure the lengths of the sequences match for error calculation
        min_length = min(len(train_seq_clean), len(predicted_train_clean))
        train_seq_clean = train_seq_clean[:min_length]
        predicted_train_clean = predicted_train_clean[:min_length]

        # Save the actual, predicted, and error data to text files
        df_actual = pd.DataFrame({'Actual': filtered_raw_seq})
        df_actual.to_csv(os.path.join(second_layer_dir, 'actual_data.txt'), index=True, sep='\t')

        df_predicted_train = pd.DataFrame({'Predicted Training': predicted_train})
        df_predicted_train.to_csv(os.path.join(second_layer_dir, 'predicted_train_data.txt'), index=True, sep='\t')

        df_predicted_test = pd.DataFrame({'Predicted Test': predictions})
        df_predicted_test.to_csv(os.path.join(second_layer_dir, 'predicted_test_data.txt'), index=True, sep='\t')

        # Calculate the percentage error between predictions and actual test values
        error_percent, perc_mean = error_calc(np.array(predictions), np.array(test_seq))
        df_error_percent = pd.DataFrame({'Error': error_percent})
        df_error_percent.to_csv(os.path.join(second_layer_dir, 'error_percent.txt'), index=True, sep='\t')

        runtime = end_time - start_time  # Calculate the total runtime
        num_calculated_epoch = len(history.history['loss'])  # Number of epochs actually run

        # Save a log file with all important metrics and details
        with open(os.path.join(second_layer_dir, 'log_file.txt'), 'w') as log_file:
            log_file.write(f"Time of running: {runtime} seconds\n")
            log_file.write(f"Number of epochs set: {n_epochs}\n")
            log_file.write(f"Number of epochs calculated: {num_calculated_epoch}\n")
            log_file.write(f"Best epoch chosen at end: {num_calculated_epoch - patience}\n")
            log_file.write(f"Final loss: {history.history['loss'][-1]}\n")
            log_file.write(f"Final RMSE: {history.history['root_mean_squared_error'][-1]}\n")
            log_file.write(f"Length of actual data: {len(filtered_raw_seq)}\n")
            log_file.write(f"Length of predicted training: {len(predicted_train)}\n")
            log_file.write(f"Length of predicted test: {len(predictions)}\n")
            log_file.write(f"Mean Error Percentage: {perc_mean}\n")

        save_loss_rmse(history, second_layer_dir)  # Save the loss and RMSE values over epochs to a file
    else:
        print('All values were returned as NaN (not a number). No calculations possible')
        # Save a log file indicating that all predictions were NaN
        with open(os.path.join(second_layer_dir, 'log_file.txt'), 'w') as log_file:
            log_file.write(f"Time of running: {end_time - start_time} seconds\n")
            log_file.write("All values returned as NaN (not a number). No calculations possible")

    clear_session()  # Clear the TensorFlow session to free up memory


# ============================
# MAIN SCRIPT EXECUTION
# ============================

# This block of code will only execute if this script is run directly (i.e., not imported as a module).
# It serves as the entry point for the script when executed.
if __name__ == '__main__':
    # Retrieve a list of all text files in the test group directory.
    # These files are expected to contain data for which predictions and evaluations will be performed.
    files = [file for file in os.listdir(test_group_path) if file.endswith(".txt")]
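
    # Expected input format (as read in process_file): each .txt file is tab-separated
    # with a header row and must contain the column 'sigma1eff[kN/m²]', e.g. with a
    # hypothetical time column (illustrative values):
    #   time[s]    sigma1eff[kN/m²]
    #   0          10.5
    #   1          11.2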

    # Iterate over each file in the list of text files.
    # For each file, call the process_file function to read, process, and analyze the data contained in the file.
    for file in files:
        process_file(file)  # Process each file


