"""
This script processes an SPSS dataset to analyze and document missing data patterns. This script can be used to reproduce the missing analyses in the manuscript "Individual Differences in Students' Reward-Avoidance Sensitivity as Predictors of Academic Progression During Elementary School"

It does the following:
1. Generates descriptive statistics for all variables and saves them to a CSV file (for checking correct import of data).
2. Calculates and documents the number and percentage of missing values for each variable.
3. Generates frequency tables for all missing patterns over times and their groups (monotone/non-monotone/ignorable).
4. Selects a subset of the data for further analysis in R (see separate script)


To reuse script:
1. Ensure all paths to files work with your directory structure. The script assumes the presence of an SPSS file named "SOGMM_noID.sav" in the working directory.
2. Replaces any specific missing value indicators with pandas' NA.
3. The two elements of the missing pattern dictionary (pattern_to_group_missingY and pattern_to_group_missingX) should be updated to reflect the missing patterns in the dataset.
4. The script assumes that the missing patterns are already known and defined in the dictionary, and based on the current study. For reuse, the user should define the missing patterns and their respective groups based on both number of observations and patterns present in their data. 
5. Finally, the element categorization in "missing_groupY" is also dependent on the user's data and preferences and should be updated accordingly (there may be other comparisons that are relevant in for example larger larger datasets). 

Dependencies:
- pandas
- numpy
- pyreadstat
- os
- IPython.display

Programmer: A. R. Olseth
Date: 2024-09-23
"""
import numpy as np
import pyreadstat
import os
import pandas as pd
from IPython.display import display


# Read the SPSS file into a dataframe
df, meta = pyreadstat.read_sav("SOGMM_noID.sav")

# Recode missing values
df = df.replace(-999, pd.NA)

# create table with descriptives for all variables
df.describe().to_csv("descriptives.csv", sep=";")

# Select subset of data
df = df[
    [
        "gend",
        "FS_BIS",
        "FS_DR",
        "FS_ESEM_RR",
        "nor1",
        "nor2",
        "nor3",
        "nor4",
        "nor5",
        "mat1",
        "mat2",
        "mat3",
        "mat4",
        "mat5",
        "eng1",
        "eng2",
        "eng3",
        "eng4",
        "eng5",
        "sam1",
        "sam2",
        "sam3",
        "sam4",
        "sam5",
    ]
]

# Describe missing values
missing_values = pd.DataFrame()
missing_values["n_missing"] = df.isnull().sum()
missing_values["percentage"] = df.isnull().sum() / len(df) * 100
missing_values.to_csv("missing_values.csv", sep=";")

# Removing gender from the missing pattern as no values are missing
df["missing_pattern_complete"] = (
    df[
        [
            "FS_BIS",
            "FS_DR",
            "FS_ESEM_RR",
            "nor1",
            "nor2",
            "nor3",
            "nor4",
            "nor5",
            "mat1",
            "mat2",
            "mat3",
            "mat4",
            "mat5",
            "eng1",
            "eng2",
            "eng3",
            "eng4",
            "eng5",
            "sam1",
            "sam2",
            "sam3",
            "sam4",
            "sam5",
        ]
    ]
    .isnull()
    .apply(lambda x: "".join(["0" if val else "1" for val in x]), axis=1)
)

# Gruping the missing patterns on the Y variable
pattern_to_group_missingY = {
    "11111111111111111111111": 1,
    "00011111111111111111111": 1,
    "11110111101111011110111": 2,
    "11111111111110111111111": 2,
    "11111111111111011111111": 2,
    "11111101111011110111101": 2,
    "11111011110111101111011": 2,
    "11101111011110111101111": 2,
    "11111111111110011111111": 2,
    "11111001110011100111001": 2,
    "11111111111111110111111": 2,
    "11101111011110011101111": 2,
    "11111111111111101111111": 2,
    "11111111111111111011111": 2,
    "11111111111111111111101": 2,
    "11111111111111111101111": 2,
    "11111111111111111111011": 2,
    "11111101111010110111101": 2,
    "11111111111111111111110": 2,
    "11110111101111011110101": 2,
    "11101101011010110101101": 2,
    "11110111101111011010111": 2,
    "11110011100111001110011": 2,
    "11111111111111011110111": 2,
    "11111111111110111110111": 2,
    "11101111011110111101011": 2,
    "11100000000000000000000": 2,
    "00001111011110111101111": 2,
    "00000111001110011100111": 2,
    "00010111101111011110111": 2,
    "00011001110011100111001": 2,
    "00001101011010110101101": 2,
    "00011111111110111111111": 2,
    "00011011110111101111011": 2,
    "00011101111011110111101": 2,
    "00011111111110011111111": 2,
    "00001111011110011101111": 2,
    "00011111111111011111111": 2,
    "00011111111111110111111": 2,
    "00010111101110011110111": 2,
    "00000011000110001100011": 2,
    "00001011010110101101011": 2,
    "00011111111111111110111": 2,
    "00011111111111101111111": 2,
    "11110000100001000010000": 3,
    "11110000100000000010000": 3,
    "00010000100001000010000": 3,
    "00010000100000000010000": 3,
    "00001000010000100001000": 4,
    "11111000110001100011000": 4,
    "11111000110000100011000": 4,
    "11101000010000100001000": 4,
    "11111000110001000011000": 4,
    "11111000110001100010000": 4,
    "00011000110001100011000": 4,
    "00001100011000110001100": 5,
    "11111100111001110011100": 5,
    "11111100111001010011100": 5,
    "11110100101001010010100": 5,
    "11101100011000110001100": 5,
    "11111100111000110011100": 5,
    "11111100111001100011100": 5,
    "00011100111001110011100": 5,
    "00010100101001010010100": 5,
    "00000100001000010000100": 5,
    "00011100111000110011100": 5,
    "11111110111101111011110": 6,
    "11111110111100111011110": 6,
    "11111110111101011011110": 6,
    "11111110111101111010110": 6,
    "11111010110101101011010": 6,
    "11111110111101110011110": 6,
    "11110110101101011010110": 6,
    "00011110111101111011110": 6,
    "00011010110101101011010": 6,
    "00000110001100011000110": 6,
}


# Gruping the missing patterns on the Y variable
pattern_to_group_missingX = {
    "11111111111111111111111": 1,
    "11110111101111011110111": 1,
    "11111111111110111111111": 1,
    "11111111111111011111111": 1,
    "11111101111011110111101": 1,
    "11111011110111101111011": 1,
    "11101111011110111101111": 1,
    "11111111111110011111111": 1,
    "11111001110011100111001": 1,
    "11111111111111110111111": 1,
    "11101111011110011101111": 1,
    "11111111111111101111111": 1,
    "11111111111111111011111": 1,
    "11111111111111111111101": 1,
    "11111111111111111101111": 1,
    "11111111111111111111011": 1,
    "11111101111010110111101": 1,
    "11111111111111111111110": 1,
    "11110111101111011110101": 1,
    "11101101011010110101101": 1,
    "11110111101111011010111": 1,
    "11110011100111001110011": 1,
    "11111111111111011110111": 1,
    "11111111111110111110111": 1,
    "11101111011110111101011": 1,
    "11100000000000000000000": 1,
    "11110000100001000010000": 1,
    "11110000100000000010000": 1,
    "11111000110001100011000": 1,
    "11111000110000100011000": 1,
    "11101000010000100001000": 1,
    "11111000110001000011000": 1,
    "11111000110001100010000": 1,
    "11111100111001110011100": 1,
    "11111100111001010011100": 1,
    "11110100101001010010100": 1,
    "11101100011000110001100": 1,
    "11111100111000110011100": 1,
    "11111100111001100011100": 1,
    "11111110111101111011110": 1,
    "11111110111100111011110": 1,
    "11111110111101011011110": 1,
    "11111110111101111010110": 1,
    "11111010110101101011010": 1,
    "11111110111101110011110": 1,
    "11110110101101011010110": 1,
    "00011111111111111111111": 2,
    "00001111011110111101111": 2,
    "00000111001110011100111": 2,
    "00010111101111011110111": 2,
    "00011001110011100111001": 2,
    "00001101011010110101101": 2,
    "00001000010000100001000": 2,
    "00011111111110111111111": 2,
    "00011011110111101111011": 2,
    "00011101111011110111101": 2,
    "00011111111110011111111": 2,
    "00001100011000110001100": 2,
    "00001111011110011101111": 2,
    "00011111111111011111111": 2,
    "00011111111111110111111": 2,
    "00010111101110011110111": 2,
    "00000011000110001100011": 2,
    "00001011010110101101011": 2,
    "00011111111111111110111": 2,
    "00011111111111101111111": 2,
    "00010000100001000010000": 2,
    "00010000100000000010000": 2,
    "00011000110001100011000": 2,
    "00011100111001110011100": 2,
    "00010100101001010010100": 2,
    "00000100001000010000100": 2,
    "00011100111000110011100": 2,
    "00011110111101111011110": 2,
    "00011010110101101011010": 2,
    "00000110001100011000110": 2,
}

# Create a new column in the DataFrame to store the group assignment
df["Y_missing_pattern"] = df["missing_pattern_complete"].map(pattern_to_group_missingY)
df["X_missing_pattern"] = df["missing_pattern_complete"].map(pattern_to_group_missingX)

# Create a table with the frequency of the missing patterns and the freque of the individual missing patterns in  missing_pattern_complete
missing_patterns_freq = pd.DataFrame()
missing_patterns_freq["missing_pattern_complete"] = df[
    "missing_pattern_complete"
].value_counts()
missing_patternsY = pd.DataFrame()
missing_patternsY["Y_missing_pattern"] = df["Y_missing_pattern"].value_counts()
missing_patternsX = pd.DataFrame()
missing_patternsX["X_missing_pattern"] = df["X_missing_pattern"].value_counts()


# Create a table with missing pattern, count, and percentage of total
missing_patterns_table = pd.DataFrame()
missing_patterns_table["missing_pattern"] = missing_patterns_freq.index
missing_patterns_table["count"] = missing_patterns_freq["missing_pattern_complete"]
missing_patterns_table["percentage"] = (
    missing_patterns_freq["missing_pattern_complete"] / len(df) * 100
)

# Create a dictionary to map the pattern numbers to pattern names for Y_missing_pattern
pattern_names_Y = {
    1: "Complete",
    2: "Ignorable",
    3: "Missing after T1",
    4: "Missing after T2",
    5: "Missing after T3",
    6: "Missing after T4",
}

# Create a new column in the DataFrame to store the pattern names for Y_missing_pattern
df["Y_missing_pattern_name"] = df["Y_missing_pattern"].map(pattern_names_Y)

# Create a table with the frequency of the pattern names for Y_missing_pattern
missing_patternsY_table = pd.DataFrame()
missing_patternsY_table["Pattern Name"] = (
    df["Y_missing_pattern_name"].value_counts().index
)
missing_patternsY_table["Count"] = df["Y_missing_pattern_name"].value_counts().values

# Save the table to a csv file
missing_patternsY_table.to_csv("missing_patternsY_table.csv", sep=";")

# Create a dictionary to map the pattern numbers to pattern names for X_missing_pattern
pattern_names_X = {
    1: "Complete",
    2: "Missing all X",
}

# Create a new column in the DataFrame to store the pattern names for X_missing_pattern
df["X_missing_pattern_name"] = df["X_missing_pattern"].map(pattern_names_X)

# Create a table with the frequency of the pattern names for X_missing_pattern
missing_patternsX_table = pd.DataFrame()
missing_patternsX_table["Pattern Name"] = (
    df["X_missing_pattern_name"].value_counts().index
)
missing_patternsX_table["Count"] = df["X_missing_pattern_name"].value_counts().values

# Save the table to a csv file
missing_patternsX_table.to_csv("missing_patternsX_table.csv", sep=";")

# I want to calculate cohens d for all variables comparing patterns 1 and 2 to the other patterns
# Combine missing groups 1 and 2, and 3 to 5
df["missing_groupY"] = df["Y_missing_pattern"].replace({1: 1, 2: 1, 3: 2, 4: 2, 5: 2})

# write out the data to a csv file for use in mplus
df.to_csv("missing_groupY.csv", sep=";")