# Load the readxl package
library(readxl)

# Replace "your_file.xlsx" with the path to your Excel file
# Specify NA values as blank cells and 0
data <- read_excel("C:\\[...]\\SM_SED_INCOMEDAT_FINAL.xlsx", na = c("",":", 0))

# View the first few rows of the imported data
head(data)


library(ggplot2)

# Get the column names containing "_F_" in their header
cols_F <- grep("_F_", colnames(data), value = TRUE)

# Create a subset with variables containing "_F_" in their header
subset_F <- data[, cols_F, with = FALSE]

# Get the column names containing "_dif" in their header within subset_F
cols_F_dif <- grep("_dif", colnames(subset_F), value = TRUE)

# Create a subset with variables containing "_dif" in their header within subset_F
subset_F_dif <- subset_F[, cols_F_dif, with = FALSE]


# Get the column names containing "_M_" in their header
cols_M <- grep("_M_", colnames(data), value = TRUE)

# Create a subset with variables containing "_M_" in their header
subset_M <- data[, cols_M, with = FALSE]

# Get the column names containing "_dif" in their header within subset_M
cols_M_dif <- grep("_dif", colnames(subset_M), value = TRUE)

# Create a subset with variables containing "_dif" in their header within subset_M
subset_M_dif <- subset_M[, cols_M_dif, with = FALSE]

# Check the data types of subset_F_dif and subset_M_dif
print(class(subset_F_dif))
print(class(subset_M_dif))

# Inspect the first few rows of subset_F_dif and subset_M_dif
print(head(subset_F_dif))
print(head(subset_M_dif))

# Check for missing values
print(sum(is.na(subset_F_dif)))
print(sum(is.na(subset_M_dif)))



# Set up the plot area
par(mfrow = c(1, 1))

# Draw boxplot for subset_F_dif in orange with transparency
boxplot(subset_F_dif, main = "Comparison of subset_F_dif and subset_M_dif",
        xlab = "Variables", ylab = "Values", col = rgb(1, 0.5, 0, alpha = 0.5), border = rgb(1, 0.5, 0, alpha = 0.5))

# Draw boxplot for subset_M_dif in purple with transparency
boxplot(subset_M_dif, add = TRUE, col = rgb(0.5, 0, 0.5, alpha = 0.5), border = rgb(0.5, 0, 0.5, alpha = 0.5))

# Add a legend outside of the plot area
legend("topright", legend = c("subset_F_dif", "subset_M_dif"), fill = c(rgb(1, 0.5, 0, alpha = 0.5), rgb(0.5, 0, 0.5, alpha = 0.5)), bty = "n")

 # Expand the plot area
 par(mar = c(5, 6, 4, 8))  # Adjust the margin to make space for the legend
 

# Extract the variables from the dataset
variables_age <- c("LT18_2020_F_MED_", "LT18_2020_M_MED_", "LT18_2020_T_MED_",
                   "B18-64_2020_F_MED_", "B18-64_2020_M_MED_", "B18-64_2020_T_MED_",
                   "G65_2020_F_MED_", "G65_2020_M_MED_", "G65_2020_T_MED_")

# Ensure all data columns are numeric
data_age <- data[variables_age]
data_age[] <- lapply(data_age, function(x) as.numeric(as.character(x)))

# Calculate min and max values for each variable
min_vals <- sapply(data_age, function(x) if (all(is.na(x))) NA else boxplot.stats(x)$stats[1])
max_vals <- sapply(data_age, function(x) if (all(is.na(x))) NA else boxplot.stats(x)$stats[5])

# Determine the Y-axis range to include the min and max labels
y_range <- range(c(min_vals, max_vals), na.rm = TRUE)
y_range <- c(y_range[1] - abs(y_range[2]) * 0.2, y_range[2] + abs(y_range[2]) * 0.2)

# Expand the plot area before creating the boxplot
par(mar = c(7, 6, 4, 12))  # Adjust the margin to make more space for the legend and min/max labels

# Create a boxplot
boxplot(data_age,
        main = "Boxplot of 2020 MED Differences by Age Group",
        col = rainbow(length(variables_age)),
        border = "black",
        las = 2,
        ylim = y_range)

# Add legend slightly to the right
legend("topright",
       legend = variables_age,
       fill = rainbow(length(variables_age)),
       bg = "transparent",
       bty = "n",
       xpd = TRUE,
       inset = c(-0.45, 0.05))  # Adjust inset values here

# Add text labels for min and max values
text(x = 1:length(variables_age), y = max_vals, labels = round(max_vals, 2), pos = 3, offset = 0.5)
text(x = 1:length(variables_age), y = min_vals, labels = round(min_vals, 2), pos = 1, offset = 0.5)

# Calculate means for each variable, removing NA values
mean_vals <- sapply(data_age, function(x) mean(x, na.rm = TRUE))

# Calculate Mean Absolute Deviation (MAD) for each variable
mad_vals <- sapply(data_age, function(x) mean(abs(x - mean(x, na.rm = TRUE)), na.rm = TRUE))

# Calculate positive and negative deviations for each variable
positive_deviations <- sapply(data_age, function(x) mean(x[x > mean(x, na.rm = TRUE)] - mean(x, na.rm = TRUE), na.rm = TRUE))
negative_deviations <- sapply(data_age, function(x) mean(mean(x, na.rm = TRUE) - x[x < mean(x, na.rm = TRUE)], na.rm = TRUE))

# Calculate percentage of values below the mean for each variable
percent_below_mean <- sapply(data_age, function(x) {
    mean(x < mean(x, na.rm = TRUE), na.rm = TRUE) * 100
})

# Create a data frame for the means, MAD, positive and negative deviations, and percentage below mean
mean_table <- data.frame(Box = variables_age, 
                         Mean = round(mean_vals, 2), 
                         MAD = round(mad_vals, 2),
                         Positive_Deviations = round(positive_deviations, 2),
                         Negative_Deviations = round(negative_deviations, 2),
                         Percent_Below_Mean = round(percent_below_mean, 2))

# Print the table
print(mean_table)






# Extract the variables from the dataset
variables_deg_dif <- c("2020_DEG1_F_dif", "2020_DEG1_M_dif", "2020_DEG1_T_dif",
                       "2020_DEG2_F_dif", "2020_DEG2_M_dif", "2020_DEG2_T_dif",
                       "2020_DEG3_F_dif", "2020_DEG3_M_dif", "2020_DEG3_T_dif")

# Extract data for boxplot
data_deg_dif <- data[variables_deg_dif]

# Create a boxplot
boxplot(data_deg_dif,
        main = "Boxplot of 2020 DEG Differences",
        col = rainbow(length(variables_deg_dif)),
        border = "black",
        las = 2)

# Add legend outside of the plot area
legend("topright",
       legend = variables_deg_dif,
       fill = rainbow(length(variables_deg_dif)),
       bg = "transparent",
       bty = "n",
       xpd = TRUE,
       inset = c(-0.35, 0.05))

# Calculate min and max values for each variable
min_vals <- sapply(data_deg_dif, function(x) boxplot.stats(x)$stats[1])
max_vals <- sapply(data_deg_dif, function(x) boxplot.stats(x)$stats[5])

# Add text labels for min and max values
text(x = 1:length(variables_deg_dif), y = max_vals, labels = round(max_vals, 2), pos = 3, offset = 0.5)
text(x = 1:length(variables_deg_dif), y = min_vals, labels = round(min_vals, 2), pos = 3, offset = 0.5)

# Expand the plot area
par(mar = c(5, 6, 4, 10))  # Adjust the margin to make more space for the legend and min/max labels

# Calculate means for each variable, removing NA values
mean_vals <- sapply(data_deg_dif, function(x) mean(x, na.rm = TRUE))

# Calculate Mean Absolute Deviation (MAD) for each variable
mad_vals <- sapply(data_deg_dif, function(x) mean(abs(x - mean(x, na.rm = TRUE)), na.rm = TRUE))

# Calculate positive and negative deviations for each variable
positive_deviations <- sapply(data_deg_dif, function(x) mean(x[x > mean(x, na.rm = TRUE)] - mean(x, na.rm = TRUE), na.rm = TRUE))
negative_deviations <- sapply(data_deg_dif, function(x) mean(mean(x, na.rm = TRUE) - x[x < mean(x, na.rm = TRUE)], na.rm = TRUE))

# Calculate percentage of values below the mean for each variable
percent_below_mean <- sapply(data_deg_dif, function(x) {
  mean(x < mean(x, na.rm = TRUE), na.rm = TRUE) * 100
})

# Create a data frame for the means, MAD, positive and negative deviations, and percentage below mean
mean_table <- data.frame(Box = variables_deg_dif, 
                         Mean = round(mean_vals, 2), 
                         MAD = round(mad_vals, 2),
                         Positive_Deviations = round(positive_deviations, 2),
                         Negative_Deviations = round(negative_deviations, 2),
                         Percent_Below_Mean = round(percent_below_mean, 2))

# Print the table
print(mean_table)





# Define the groups
groups <- c("2020_EU_18-55_M_dif", "2020_EU_18-55_F_dif", "2020_EU_18-55_T_dif",
            "2020_nonEU_18-55_M_dif", "2020_nonEU_18-55_F_dif", "2020_nonEU_18-55_T_dif",
            "2020_EU_M_elderly_dif", "2020_EU_F_elderly_dif", "2020_EU_T_elderly_dif",
            "2020_nonEU_M_elderly_dif", "2020_nonEU_F_elderly_dif", "2020_nonEU_T_elderly_dif",
            "2020_EU_M_T_dif", "2020_EU_F_T_dif", "2020_EU_T_T_dif", "2020_nonEU_M_T_dif", "2020_nonEU_F_T_dif", "2020_nonEU_T_T_dif")

# Extract data for boxplot
data_groups <- data[groups]

# Expand the plot area
par(mar = c(7, 6, 4, 12))  # Further adjust the margin to make more space for the legend and min/max labels

# Create a boxplot
boxplot(data_groups,
        main = "Boxplot of 2020 EU and non-EU Differences",
        col = rainbow(length(groups)),
        border = "black",
        las = 2)

# Calculate min and max values for each group
min_vals <- sapply(data_groups, function(x) min(x, na.rm = TRUE))
max_vals <- sapply(data_groups, function(x) max(x, na.rm = TRUE))

# Add min and max values to the plot with smaller font size
text(x = 1:length(groups), y = max_vals, labels = round(max_vals, 2), pos = 3, offset = 0.5, cex = 0.7)
text(x = 1:length(groups), y = min_vals, labels = round(min_vals, 2), pos = 3, offset = 0.5, cex = 0.7)

# Add legend outside of the plot area with smaller font size
legend("topright",
       legend = groups,
       fill = rainbow(length(groups)),
       bg = "transparent",
       bty = "n",
       xpd = TRUE,
       inset = c(-0.6, 0.05),
       cex = 0.7)

# Calculate means for each group, removing NA values
mean_vals_groups <- sapply(data_groups, function(x) mean(x, na.rm = TRUE))

# Calculate Mean Absolute Deviation (MAD) for each group
mad_vals_groups <- sapply(data_groups, function(x) mean(abs(x - mean(x, na.rm = TRUE)), na.rm = TRUE))

# Calculate positive and negative deviations for each group
positive_deviations_groups <- sapply(data_groups, function(x) mean(x[x > mean(x, na.rm = TRUE)] - mean(x, na.rm = TRUE), na.rm = TRUE))
negative_deviations_groups <- sapply(data_groups, function(x) mean(mean(x, na.rm = TRUE) - x[x < mean(x, na.rm = TRUE)], na.rm = TRUE))

# Calculate percentage of values below the mean for each group
percent_below_mean_groups <- sapply(data_groups, function(x) {
  mean(x < mean(x, na.rm = TRUE), na.rm = TRUE) * 100
})

# Create a data frame for the means, MAD, positive and negative deviations, and percentage below mean for groups
mean_table_groups <- data.frame(Group = groups, 
                                Mean = round(mean_vals_groups, 2), 
                                MAD = round(mad_vals_groups, 2),
                                Positive_Deviations = round(positive_deviations_groups, 2),
                                Negative_Deviations = round(negative_deviations_groups, 2),
                                Percent_Below_Mean = round(percent_below_mean_groups, 2))

# Print the table for groups
print(mean_table_groups)




# Define the variables
variables <- c("2020_OWN_F_LT18_dif", "2020_OWN_M_LT18_dif", "2020_OWN_T_LT18_dif",
               "2020_OWN_F_B18-64_dif", "2020_OWN_M_B18-64_dif", "2020_OWN_T_B18-64_dif",
               "2020_OWN_F_G65_dif", "2020_OWN_M_G65_dif", "2020_OWN_T_G65_dif",
               "2020_RENT_F_LT18_dif", "2020_RENT_M_LT18_dif", "2020_RENT_T_LT18_dif",
               "2020_RENT_F_B18-64_dif", "2020_RENT_M_B18-64_dif", "2020_RENT_T_B18-64_dif",
               "2020_RENT_F_G65_dif", "2020_RENT_M_G65_dif", "2020_RENT_T_G65_dif")

# Extract data for boxplot
data_variables <- data[variables]

# Calculate min and max values for each variable
min_vals <- sapply(data_variables, function(x) min(x, na.rm = TRUE))
max_vals <- sapply(data_variables, function(x) max(x, na.rm = TRUE))

# Determine the maximum range of values to set ylim
max_range <- max(max_vals, na.rm = TRUE) - min(min_vals, na.rm = TRUE)

# Expand the plot area to accommodate labels
par(mfrow = c(1, 1), mar = c(6, 6, 4, 8))  # Adjust the margin to balance space for labels and plot area
plot(1, type = "n", xlim = c(0, length(variables) + 1), ylim = c(min(min_vals, na.rm = TRUE) - 0.3 * max_range, max(max_vals, na.rm = TRUE) + 0.3 * max_range), 
     xaxt = "n", xlab = "", ylab = "Values", main = "Boxplot of 2020 Housing Variables Differences", cex.main = 1.2)

# Create boxplots with adjusted positions for each variable
for (i in seq_along(variables)) {
  boxplot(data_variables[[i]], at = i, add = TRUE, col = rainbow(length(variables))[i], border = "black", width = 0.5)
}

# Add min and max values to the plot with larger font size
text(x = 1:length(variables), y = max_vals, labels = round(max_vals, 2), pos = 3, offset = 0.5, cex = 0.8)
text(x = 1:length(variables), y = min_vals, labels = round(min_vals, 2), pos = 1, offset = 0.5, cex = 0.8)

# Add legend outside of the plot area with larger font size and adjusted inset
legend("topright",
       legend = variables,
       fill = rainbow(length(variables)),
       bg = "transparent",
       bty = "n",
       xpd = TRUE,
       inset = c(-0.35, 0.05),  # Adjust inset values here
       cex = 0.8)

# Calculate means for each variable, removing NA values
mean_vals_variables <- sapply(data_variables, function(x) mean(x, na.rm = TRUE))

# Calculate Mean Absolute Deviation (MAD) for each variable
mad_vals_variables <- sapply(data_variables, function(x) mean(abs(x - mean(x, na.rm = TRUE)), na.rm = TRUE))

# Calculate positive and negative deviations for each variable
positive_deviations_variables <- sapply(data_variables, function(x) mean(x[x > mean(x, na.rm = TRUE)] - mean(x, na.rm = TRUE), na.rm = TRUE))
negative_deviations_variables <- sapply(data_variables, function(x) mean(mean(x, na.rm = TRUE) - x[x < mean(x, na.rm = TRUE)], na.rm = TRUE))

# Calculate percentage of values below the mean for each variable
percent_below_mean_variables <- sapply(data_variables, function(x) {
  mean(x < mean(x, na.rm = TRUE), na.rm = TRUE) * 100
})

# Create a data frame for the means, MAD, positive and negative deviations, and percentage below mean for variables
mean_table_variables <- data.frame(Variable = variables, 
                                   Mean = round(mean_vals_variables, 2), 
                                   MAD = round(mad_vals_variables, 2),
                                   Positive_Deviations = round(positive_deviations_variables, 2),
                                   Negative_Deviations = round(negative_deviations_variables, 2),
                                   Percent_Below_Mean = round(percent_below_mean_variables, 2))

# Print the table for variables with larger font size
print(mean_table_variables, digits = 3, cex.main = 0.8)




# Define the variables
variables <- c("2020_SINGLE_MED_dif", "2020_SINGLE_1C_MED_dif", "2020_2AD_dif", 
               "2020_2AD_1C_dif", "2020_2AD_2C_dif", "2020_2AD_3C_dif", 
               "2020_3AD_dif", "2020_3AD_CH_dif")

# Extract data for boxplot
data_variables <- data[variables]

# Create a boxplot
boxplot(data_variables,
        main = "Boxplot of 2020 Health Variables Differences",
        col = rainbow(length(variables)),
        border = "black",
        las = 2)

# Calculate min and max values for each variable
min_vals <- sapply(data_variables, function(x) boxplot.stats(x)$stats[1])
max_vals <- sapply(data_variables, function(x) boxplot.stats(x)$stats[5])

# Add text labels for min and max values
text(x = 1:length(variables), y = max_vals, labels = round(max_vals, 2), pos = 3, offset = 0.5)
text(x = 1:length(variables), y = min_vals, labels = round(min_vals, 2), pos = 3, offset = 0.5)

# Add legend outside of the plot area
legend("topright",
       legend = variables,
       fill = rainbow(length(variables)),
       bg = "transparent",
       bty = "n",
       xpd = TRUE,
       inset = c(-0.5, 0.05))

# Expand the plot area
par(mar = c(5, 6, 4, 10))  # Adjust the margin to make more space for the legend and min/max labels

# Calculate means for each variable, removing NA values
mean_vals_variables <- sapply(data_variables, function(x) mean(x, na.rm = TRUE))

# Calculate Mean Absolute Deviation (MAD) for each variable
mad_vals_variables <- sapply(data_variables, function(x) mean(abs(x - mean(x, na.rm = TRUE)), na.rm = TRUE))

# Calculate positive and negative deviations for each variable
positive_deviations_variables <- sapply(data_variables, function(x) mean(x[x > mean(x, na.rm = TRUE)] - mean(x, na.rm = TRUE), na.rm = TRUE))
negative_deviations_variables <- sapply(data_variables, function(x) mean(mean(x, na.rm = TRUE) - x[x < mean(x, na.rm = TRUE)], na.rm = TRUE))

# Calculate percentage of values below the mean for each variable
percent_below_mean_variables <- sapply(data_variables, function(x) {
  mean(x < mean(x, na.rm = TRUE), na.rm = TRUE) * 100
})

# Create a data frame for the means, MAD, positive and negative deviations, and percentage below mean for variables
mean_table_variables <- data.frame(Variable = variables, 
                                   Mean = round(mean_vals_variables, 2), 
                                   MAD = round(mad_vals_variables, 2),
                                   Positive_Deviations = round(positive_deviations_variables, 2),
                                   Negative_Deviations = round(negative_deviations_variables, 2),
                                   Percent_Below_Mean = round(percent_below_mean_variables, 2))

# Print the table for variables
print(mean_table_variables)



