# convert data files into CSV files

import os
import numpy as np
import pandas as pd
import re


# input directories: features_dir is listed to find the files to convert;
# for every feature file a target file named '<planet>_<spot>_<photon>.txt'
# (zero-padded to 4/2/2 digits) is opened in target_dir
features_dir = 'C:/Users/pofner/Downloads/ariel/noisy_train'
target_dir = 'C:/Users/pofner/Downloads/ariel/params_train'

# output location and base name of the generated files
output_dir = 'data'
output_file_name = 'ariel_train_data'

# output formats to write
export_csv = True
export_hdf5 = False # HDF5 does not work (too many columns)

# expected layout of the input files
n_star_parameters = 6 # number of '# name: value' lines read from the top of each feature file
n_wavelengths = 55 # number of vector lines read per feature file; also the expected length of the target vector
n_time = 300 # expected number of tab-separated values per vector line of a feature file
n_intermediate_targets = 2 # number of '# name: value' lines read from the top of each target file

# list the feature files to convert (the directories above hold training data);
# data_df is the table of parsed observations that is written to disk in batches
data_df = pd.DataFrame()
files = os.listdir(features_dir)
# uncomment to process only the first 100 files
#files = files[0:100]


# ordered list of output column names
# identifiers taken from the file name come first, followed by the scalar features
columns = [
    'planet_idx', 'spot_noise_idx', 'photon_noise_idx',
    'star_temp', 'star_logg', 'star_rad', 'star_mass', 'star_k_mag', 'period',
]
# one feature column per (wavelength, time) pair, named 'w<wavelength>-<time>',
# both numbers 1-based, with the time index varying fastest
columns.extend(
    'w%d-%d' % (wave_no, time_no)
    for wave_no in range(1, n_wavelengths + 1)
    for time_no in range(1, n_time + 1)
)
# targets: two scalar targets, then one 'r<wavelength>' column per wavelength
columns.extend(['sma', 'incl'])
columns.extend('r%d' % wave_no for wave_no in range(1, n_wavelengths + 1))


# read data files
#
# Every feature file is parsed together with its target file into one row of
# the output table. Rows are buffered as float arrays and written to disk in
# batches; rows of an incomplete last batch are left in data_df for the code
# that follows the loop.


def _read_scalar_values(text_file, n_lines, pattern, file_path):
    """Read n_lines lines of the form '# <name>: <number>' from text_file.

    Returns a dict mapping every parsed name to its value as float.
    Raises ValueError when a line does not match pattern.
    """
    values = {}
    for _ in range(n_lines):
        line = text_file.readline()
        match = pattern.search(line)
        if match is None:
            raise ValueError('cannot parse scalar line %r in %s' % (line, file_path))
        values[match.group(1)] = float(match.group(2))
    return values


def _read_vector_values(text_file, expected_length, file_path):
    """Read one line of tab-separated numbers from text_file as a list of floats.

    Raises ValueError when the line does not hold exactly expected_length entries.
    """
    # strip only the line terminator: cutting off the last character
    # unconditionally would truncate the final number of a file that does not
    # end with a newline
    entries = text_file.readline().rstrip('\r\n').split('\t')
    if len(entries) != expected_length:
        raise ValueError('expected %d values per line but found %d in %s'
                         % (expected_length, len(entries), file_path))
    return [float(entry) for entry in entries]


def _rows_to_table(rows, file_indices):
    """Stack buffered rows into a DataFrame indexed by file_idx, columns in output order."""
    table = pd.DataFrame(np.vstack(rows),
                         index=pd.Index(file_indices, name='file_idx'),
                         columns=columns)
    # rows are buffered as float arrays; give the identifier columns their
    # integer dtype back so that they are written as integers
    for column_name in id_columns:
        table[column_name] = table[column_name].astype('int64')
    return table


# patterns are compiled once instead of once per file / line
file_name_pattern = re.compile(r'(\d+)_(\d+)_(\d+)\.txt')
# '# <name>: <number>'; the number may be a single digit and may carry a sign,
# a fractional part and an exponent
scalar_line_pattern = re.compile(r'#\s+(.+):\s+([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)')

id_columns = ['planet_idx', 'spot_noise_idx', 'photon_noise_idx']
expected_columns = set(columns)
# column names of the vector features (one list per wavelength) and of the target vector
feature_names = [['w' + str(wave_idx + 1) + '-' + str(time_idx + 1) for time_idx in range(n_time)]
                 for wave_idx in range(n_wavelengths)]
target_names = ['r' + str(wave_idx + 1) for wave_idx in range(n_wavelengths)]

# a batch is about one tenth of the input; at least 1, otherwise fewer than
# 10 input files would make the modulo in the flush check divide by zero
read_n_files_in_a_row = max(1, len(files) // 10)
start_idx = 0
output_file_idx = 0
# to resume with the last batch only, use these values instead:
#start_idx = read_n_files_in_a_row*9
#output_file_idx = 9

# create the output directory up front instead of failing at the first write
os.makedirs(output_dir, exist_ok=True)

pending_rows = []          # one float array per parsed file, in column order
pending_file_indices = []  # index into files of every buffered row

for file_idx in range(start_idx, len(files)):
    file_name = files[file_idx]

    file_name_match = file_name_pattern.search(file_name)
    if file_name_match is None:
        raise ValueError('unexpected file name %r in %s, expected <planet>_<spot>_<photon>.txt'
                         % (file_name, features_dir))
    planet_idx, spot_noise_idx, photon_noise_idx = (int(part) for part in file_name_match.groups())

    star_data = {
        'planet_idx': planet_idx,
        'spot_noise_idx': spot_noise_idx,
        'photon_noise_idx': photon_noise_idx,
    }

    feature_path = os.path.join(features_dir, file_name)
    target_path = os.path.join(target_dir, ('%04d_%02d_%02d.txt' % (planet_idx, spot_noise_idx, photon_noise_idx)))

    # the with-statement closes both files even when parsing raises
    with open(feature_path) as feature_file, open(target_path) as target_file:
        # load scalar features
        star_data.update(_read_scalar_values(feature_file, n_star_parameters, scalar_line_pattern, feature_path))

        # load vector features (one line per wavelength)
        for wave_idx in range(n_wavelengths):
            star_data.update(zip(feature_names[wave_idx], _read_vector_values(feature_file, n_time, feature_path)))

        # load intermediate targets
        star_data.update(_read_scalar_values(target_file, n_intermediate_targets, scalar_line_pattern, target_path))

        # load target vector
        star_data.update(zip(target_names, _read_vector_values(target_file, n_wavelengths, target_path)))

    # the parsed names must be exactly the output columns
    mismatched_names = star_data.keys() ^ expected_columns
    if mismatched_names:
        raise ValueError('columns missing or unexpected for %s: %s' % (file_name, sorted(mismatched_names)))

    # add observation to the buffer; appending to a list is O(1) whereas
    # DataFrame.append (removed in pandas 2.0) copied the whole table per row
    pending_rows.append(np.array([star_data[column_name] for column_name in columns], dtype=np.float64))
    pending_file_indices.append(file_idx)

    # write table on harddisk
    if ((file_idx + 1) % read_n_files_in_a_row) == 0:
        print('writing new observations up to index %d on harddisk' % (file_idx + 1))

        data_df = _rows_to_table(pending_rows, pending_file_indices)

        # export to CSV file
        if export_csv:
            output_file_idx += 1
            data_df.to_csv(output_dir + '/' + output_file_name + '_' + str(output_file_idx) + '.csv', mode = 'w', header = True)

        # export to HDF5 file
        if export_hdf5:
            # WARNING: does not work, HDF5 does not support this large number of columns!
            if (file_idx + 1) == read_n_files_in_a_row:
                # first time
                data_df.to_hdf(output_dir + '/' + output_file_name + '.h5',  key = 'ariel', mode = 'w', format = 'table', complevel = 0, complib = 'lzo')
            else:
                # append to HDF5 file
                data_df.to_hdf(output_dir + '/' + output_file_name + '.h5', key = 'ariel', mode = 'a', format = 'table', append = True, complevel = 0, complib = 'lzo')

        # delete table content
        pending_rows = []
        pending_file_indices = []
        data_df = data_df.iloc[0:0]

# rows of an incomplete last batch are handed over in data_df
if pending_rows:
    data_df = _rows_to_table(pending_rows, pending_file_indices)


# flush the observations that did not fill a complete batch
if len(data_df) > 0:
    print('writing remaining observations on harddisk')

    # CSV export: the leftovers go into their own, consecutively numbered file
    if export_csv:
        output_file_idx += 1
        csv_path = '%s/%s_%d.csv' % (output_dir, output_file_name, output_file_idx)
        data_df.to_csv(csv_path, mode='w', header=True)

    # HDF5 export: the leftovers are appended to the single HDF5 file
    # WARNING: does not work, HDF5 does not support this large number of columns!
    if export_hdf5:
        hdf5_path = '%s/%s.h5' % (output_dir, output_file_name)
        data_df.to_hdf(hdf5_path, key='ariel', mode='a', format='table',
                       append=True, complevel=0, complib='lzo')
