# rearrange data in the CSV files so that all observations of a planet are in the same CSV file

import os
import pandas as pd

# Naming scheme of the CSV data files: file number i is stored as
# <data_dir>/<data_file_prefix><i>.csv
data_file_prefix = 'ariel_test_data_'
data_dir = 'data'

# Number of data files; files are numbered starting at 1
n_data_files = 10

# Set to True to run the planet-index check on the data files first
check_file_indices = False

if check_file_indices:
    # Check that planet indices do not decrease from one data file to the
    # next: the smallest planet index of each file must be at least the
    # largest planet index seen in the preceding files, and each file must
    # contain more than one distinct planet index.
    max_planet_idx = 0
    for file_idx in range(1, n_data_files + 1):
        print('checking data file %d' % file_idx)
        # only the planet index column is needed for this check
        data_df = pd.read_csv(
            os.path.join(data_dir, data_file_prefix + str(file_idx) + '.csv'),
            index_col='file_idx',
            usecols=['file_idx', 'planet_idx'],
        )

        # vectorized Series reductions (same as in the rearrangement step)
        # instead of the Python builtins, which iterate element by element
        min_planet_idx_in_data_file = data_df['planet_idx'].min()
        max_planet_idx_in_data_file = data_df['planet_idx'].max()

        starts_after_previous = min_planet_idx_in_data_file >= max_planet_idx
        has_several_planets = max_planet_idx_in_data_file > min_planet_idx_in_data_file
        if not (starts_after_previous and has_several_planets):
            raise ValueError('planet indices are not steadily increasing!')

        max_planet_idx = max_planet_idx_in_data_file
    print('planet indexing is OK')


def _data_file_path(file_idx):
    """Return the path of the CSV data file with the given number."""
    return os.path.join(data_dir, data_file_prefix + str(file_idx) + '.csv')


# Walk over consecutive pairs of data files ("pre" and "post"). If the
# largest planet index of the pre file equals the smallest planet index of
# the post file, the planet's rows are split across the file boundary: move
# them from the post file into the pre file, then write the pre file back.
#
# NOTE(review): if every row of a post file belongs to the shifted planet,
# the post frame becomes empty and a continuation of that planet in the
# following file is not merged - confirm a planet never spans more than two
# files.
pre_data_df = pd.read_csv(_data_file_path(1), index_col='file_idx')
for pre_file_idx in range(1, n_data_files):
    print('checking data file %d' % pre_file_idx)
    post_data_df = pd.read_csv(_data_file_path(pre_file_idx + 1), index_col='file_idx')

    pre_max_planet_idx = pre_data_df['planet_idx'].max()
    post_min_planet_idx = post_data_df['planet_idx'].min()

    if post_min_planet_idx == pre_max_planet_idx:
        print('shifting planet %d from data file %d to data file %d' % (pre_max_planet_idx, pre_file_idx + 1, pre_file_idx))

        is_shared_planet = post_data_df['planet_idx'] == pre_max_planet_idx

        # add planet to pre data frame; DataFrame.append was removed in
        # pandas 2.0, pd.concat is the equivalent call. verify_integrity
        # raises if an index label would end up duplicated.
        pre_data_df = pd.concat(
            [pre_data_df, post_data_df[is_shared_planet]],
            ignore_index=False,
            verify_integrity=True,
        )

        # delete planet from post data frame
        post_data_df = post_data_df[~is_shared_planet]

    # save pre data file to harddisk
    pre_file_path = _data_file_path(pre_file_idx)
    print('saving file "%s"' % pre_file_path)
    pre_data_df.to_csv(pre_file_path, mode='w', header=True)

    # the post file becomes the pre file of the next pair
    pre_data_df = post_data_df

# save the last data file to harddisk; computed from n_data_files rather than
# the loop variable, which is undefined when the loop body never runs (file 1
# is always read above, so it is the last file in that case)
last_file_path = _data_file_path(max(n_data_files, 1))
print('saving file "%s"' % last_file_path)
pre_data_df.to_csv(last_file_path, mode='w', header=True)
