# Load the readxl package library(readxl) # Replace "your_file.xlsx" with the path to your Excel file # Specify NA values as blank cells and 0 data <- read_excel("C:\\[...]\\SM_SED_INCOMEDAT_FINAL.xlsx", na = c("",":", 0)) # View the first few rows of the imported data head(data) library(ggplot2) # Get the column names containing "_F_" in their header cols_F <- grep("_F_", colnames(data), value = TRUE) # Create a subset with variables containing "_F_" in their header subset_F <- data[, cols_F, with = FALSE] # Get the column names containing "_dif" in their header within subset_F cols_F_dif <- grep("_dif", colnames(subset_F), value = TRUE) # Create a subset with variables containing "_dif" in their header within subset_F subset_F_dif <- subset_F[, cols_F_dif, with = FALSE] # Get the column names containing "_M_" in their header cols_M <- grep("_M_", colnames(data), value = TRUE) # Create a subset with variables containing "_M_" in their header subset_M <- data[, cols_M, with = FALSE] # Get the column names containing "_dif" in their header within subset_M cols_M_dif <- grep("_dif", colnames(subset_M), value = TRUE) # Create a subset with variables containing "_dif" in their header within subset_M subset_M_dif <- subset_M[, cols_M_dif, with = FALSE] # Check the data types of subset_F_dif and subset_M_dif print(class(subset_F_dif)) print(class(subset_M_dif)) # Inspect the first few rows of subset_F_dif and subset_M_dif print(head(subset_F_dif)) print(head(subset_M_dif)) # Check for missing values print(sum(is.na(subset_F_dif))) print(sum(is.na(subset_M_dif))) # Set up the plot area par(mfrow = c(1, 1)) # Draw boxplot for subset_F_dif in orange with transparency boxplot(subset_F_dif, main = "Comparison of subset_F_dif and subset_M_dif", xlab = "Variables", ylab = "Values", col = rgb(1, 0.5, 0, alpha = 0.5), border = rgb(1, 0.5, 0, alpha = 0.5)) # Draw boxplot for subset_M_dif in purple with transparency boxplot(subset_M_dif, add = TRUE, col = rgb(0.5, 0, 0.5, alpha = 0.5), border = rgb(0.5, 0, 0.5, alpha = 0.5)) # Add a legend outside of the plot area legend("topright", legend = c("subset_F_dif", "subset_M_dif"), fill = c(rgb(1, 0.5, 0, alpha = 0.5), rgb(0.5, 0, 0.5, alpha = 0.5)), bty = "n") # Expand the plot area par(mar = c(5, 6, 4, 8)) # Adjust the margin to make space for the legend # Extract the variables from the dataset variables_age <- c("LT18_2020_F_MED_", "LT18_2020_M_MED_", "LT18_2020_T_MED_", "B18-64_2020_F_MED_", "B18-64_2020_M_MED_", "B18-64_2020_T_MED_", "G65_2020_F_MED_", "G65_2020_M_MED_", "G65_2020_T_MED_") # Ensure all data columns are numeric data_age <- data[variables_age] data_age[] <- lapply(data_age, function(x) as.numeric(as.character(x))) # Calculate min and max values for each variable min_vals <- sapply(data_age, function(x) if (all(is.na(x))) NA else boxplot.stats(x)$stats[1]) max_vals <- sapply(data_age, function(x) if (all(is.na(x))) NA else boxplot.stats(x)$stats[5]) # Determine the Y-axis range to include the min and max labels y_range <- range(c(min_vals, max_vals), na.rm = TRUE) y_range <- c(y_range[1] - abs(y_range[2]) * 0.2, y_range[2] + abs(y_range[2]) * 0.2) # Expand the plot area before creating the boxplot par(mar = c(7, 6, 4, 12)) # Adjust the margin to make more space for the legend and min/max labels # Create a boxplot boxplot(data_age, main = "Boxplot of 2020 MED Differences by Age Group", col = rainbow(length(variables_age)), border = "black", las = 2, ylim = y_range) # Add legend slightly to the right legend("topright", legend = variables_age, fill = rainbow(length(variables_age)), bg = "transparent", bty = "n", xpd = TRUE, inset = c(-0.45, 0.05)) # Adjust inset values here # Add text labels for min and max values text(x = 1:length(variables_age), y = max_vals, labels = round(max_vals, 2), pos = 3, offset = 0.5) text(x = 1:length(variables_age), y = min_vals, labels = round(min_vals, 2), pos = 1, offset = 0.5) # Calculate means for each variable, removing NA values mean_vals <- sapply(data_age, function(x) mean(x, na.rm = TRUE)) # Calculate Mean Absolute Deviation (MAD) for each variable mad_vals <- sapply(data_age, function(x) mean(abs(x - mean(x, na.rm = TRUE)), na.rm = TRUE)) # Calculate positive and negative deviations for each variable positive_deviations <- sapply(data_age, function(x) mean(x[x > mean(x, na.rm = TRUE)] - mean(x, na.rm = TRUE), na.rm = TRUE)) negative_deviations <- sapply(data_age, function(x) mean(mean(x, na.rm = TRUE) - x[x < mean(x, na.rm = TRUE)], na.rm = TRUE)) # Calculate percentage of values below the mean for each variable percent_below_mean <- sapply(data_age, function(x) { mean(x < mean(x, na.rm = TRUE), na.rm = TRUE) * 100 }) # Create a data frame for the means, MAD, positive and negative deviations, and percentage below mean mean_table <- data.frame(Box = variables_age, Mean = round(mean_vals, 2), MAD = round(mad_vals, 2), Positive_Deviations = round(positive_deviations, 2), Negative_Deviations = round(negative_deviations, 2), Percent_Below_Mean = round(percent_below_mean, 2)) # Print the table print(mean_table) # Extract the variables from the dataset variables_deg_dif <- c("2020_DEG1_F_dif", "2020_DEG1_M_dif", "2020_DEG1_T_dif", "2020_DEG2_F_dif", "2020_DEG2_M_dif", "2020_DEG2_T_dif", "2020_DEG3_F_dif", "2020_DEG3_M_dif", "2020_DEG3_T_dif") # Extract data for boxplot data_deg_dif <- data[variables_deg_dif] # Create a boxplot boxplot(data_deg_dif, main = "Boxplot of 2020 DEG Differences", col = rainbow(length(variables_deg_dif)), border = "black", las = 2) # Add legend outside of the plot area legend("topright", legend = variables_deg_dif, fill = rainbow(length(variables_deg_dif)), bg = "transparent", bty = "n", xpd = TRUE, inset = c(-0.35, 0.05)) # Calculate min and max values for each variable min_vals <- sapply(data_deg_dif, function(x) boxplot.stats(x)$stats[1]) max_vals <- sapply(data_deg_dif, function(x) boxplot.stats(x)$stats[5]) # Add text labels for min and max values text(x = 1:length(variables_deg_dif), y = max_vals, labels = round(max_vals, 2), pos = 3, offset = 0.5) text(x = 1:length(variables_deg_dif), y = min_vals, labels = round(min_vals, 2), pos = 3, offset = 0.5) # Expand the plot area par(mar = c(5, 6, 4, 10)) # Adjust the margin to make more space for the legend and min/max labels # Calculate means for each variable, removing NA values mean_vals <- sapply(data_deg_dif, function(x) mean(x, na.rm = TRUE)) # Calculate Mean Absolute Deviation (MAD) for each variable mad_vals <- sapply(data_deg_dif, function(x) mean(abs(x - mean(x, na.rm = TRUE)), na.rm = TRUE)) # Calculate positive and negative deviations for each variable positive_deviations <- sapply(data_deg_dif, function(x) mean(x[x > mean(x, na.rm = TRUE)] - mean(x, na.rm = TRUE), na.rm = TRUE)) negative_deviations <- sapply(data_deg_dif, function(x) mean(mean(x, na.rm = TRUE) - x[x < mean(x, na.rm = TRUE)], na.rm = TRUE)) # Calculate percentage of values below the mean for each variable percent_below_mean <- sapply(data_deg_dif, function(x) { mean(x < mean(x, na.rm = TRUE), na.rm = TRUE) * 100 }) # Create a data frame for the means, MAD, positive and negative deviations, and percentage below mean mean_table <- data.frame(Box = variables_deg_dif, Mean = round(mean_vals, 2), MAD = round(mad_vals, 2), Positive_Deviations = round(positive_deviations, 2), Negative_Deviations = round(negative_deviations, 2), Percent_Below_Mean = round(percent_below_mean, 2)) # Print the table print(mean_table) # Define the groups groups <- c("2020_EU_18-55_M_dif", "2020_EU_18-55_F_dif", "2020_EU_18-55_T_dif", "2020_nonEU_18-55_M_dif", "2020_nonEU_18-55_F_dif", "2020_nonEU_18-55_T_dif", "2020_EU_M_elderly_dif", "2020_EU_F_elderly_dif", "2020_EU_T_elderly_dif", "2020_nonEU_M_elderly_dif", "2020_nonEU_F_elderly_dif", "2020_nonEU_T_elderly_dif", "2020_EU_M_T_dif", "2020_EU_F_T_dif", "2020_EU_T_T_dif", "2020_nonEU_M_T_dif", "2020_nonEU_F_T_dif", "2020_nonEU_T_T_dif") # Extract data for boxplot data_groups <- data[groups] # Expand the plot area par(mar = c(7, 6, 4, 12)) # Further adjust the margin to make more space for the legend and min/max labels # Create a boxplot boxplot(data_groups, main = "Boxplot of 2020 EU and non-EU Differences", col = rainbow(length(groups)), border = "black", las = 2) # Calculate min and max values for each group min_vals <- sapply(data_groups, function(x) min(x, na.rm = TRUE)) max_vals <- sapply(data_groups, function(x) max(x, na.rm = TRUE)) # Add min and max values to the plot with smaller font size text(x = 1:length(groups), y = max_vals, labels = round(max_vals, 2), pos = 3, offset = 0.5, cex = 0.7) text(x = 1:length(groups), y = min_vals, labels = round(min_vals, 2), pos = 3, offset = 0.5, cex = 0.7) # Add legend outside of the plot area with smaller font size legend("topright", legend = groups, fill = rainbow(length(groups)), bg = "transparent", bty = "n", xpd = TRUE, inset = c(-0.6, 0.05), cex = 0.7) # Calculate means for each group, removing NA values mean_vals_groups <- sapply(data_groups, function(x) mean(x, na.rm = TRUE)) # Calculate Mean Absolute Deviation (MAD) for each group mad_vals_groups <- sapply(data_groups, function(x) mean(abs(x - mean(x, na.rm = TRUE)), na.rm = TRUE)) # Calculate positive and negative deviations for each group positive_deviations_groups <- sapply(data_groups, function(x) mean(x[x > mean(x, na.rm = TRUE)] - mean(x, na.rm = TRUE), na.rm = TRUE)) negative_deviations_groups <- sapply(data_groups, function(x) mean(mean(x, na.rm = TRUE) - x[x < mean(x, na.rm = TRUE)], na.rm = TRUE)) # Calculate percentage of values below the mean for each group percent_below_mean_groups <- sapply(data_groups, function(x) { mean(x < mean(x, na.rm = TRUE), na.rm = TRUE) * 100 }) # Create a data frame for the means, MAD, positive and negative deviations, and percentage below mean for groups mean_table_groups <- data.frame(Group = groups, Mean = round(mean_vals_groups, 2), MAD = round(mad_vals_groups, 2), Positive_Deviations = round(positive_deviations_groups, 2), Negative_Deviations = round(negative_deviations_groups, 2), Percent_Below_Mean = round(percent_below_mean_groups, 2)) # Print the table for groups print(mean_table_groups) # Define the variables variables <- c("2020_OWN_F_LT18_dif", "2020_OWN_M_LT18_dif", "2020_OWN_T_LT18_dif", "2020_OWN_F_B18-64_dif", "2020_OWN_M_B18-64_dif", "2020_OWN_T_B18-64_dif", "2020_OWN_F_G65_dif", "2020_OWN_M_G65_dif", "2020_OWN_T_G65_dif", "2020_RENT_F_LT18_dif", "2020_RENT_M_LT18_dif", "2020_RENT_T_LT18_dif", "2020_RENT_F_B18-64_dif", "2020_RENT_M_B18-64_dif", "2020_RENT_T_B18-64_dif", "2020_RENT_F_G65_dif", "2020_RENT_M_G65_dif", "2020_RENT_T_G65_dif") # Extract data for boxplot data_variables <- data[variables] # Calculate min and max values for each variable min_vals <- sapply(data_variables, function(x) min(x, na.rm = TRUE)) max_vals <- sapply(data_variables, function(x) max(x, na.rm = TRUE)) # Determine the maximum range of values to set ylim max_range <- max(max_vals, na.rm = TRUE) - min(min_vals, na.rm = TRUE) # Expand the plot area to accommodate labels par(mfrow = c(1, 1), mar = c(6, 6, 4, 8)) # Adjust the margin to balance space for labels and plot area plot(1, type = "n", xlim = c(0, length(variables) + 1), ylim = c(min(min_vals, na.rm = TRUE) - 0.3 * max_range, max(max_vals, na.rm = TRUE) + 0.3 * max_range), xaxt = "n", xlab = "", ylab = "Values", main = "Boxplot of 2020 Housing Variables Differences", cex.main = 1.2) # Create boxplots with adjusted positions for each variable for (i in seq_along(variables)) { boxplot(data_variables[[i]], at = i, add = TRUE, col = rainbow(length(variables))[i], border = "black", width = 0.5) } # Add min and max values to the plot with larger font size text(x = 1:length(variables), y = max_vals, labels = round(max_vals, 2), pos = 3, offset = 0.5, cex = 0.8) text(x = 1:length(variables), y = min_vals, labels = round(min_vals, 2), pos = 1, offset = 0.5, cex = 0.8) # Add legend outside of the plot area with larger font size and adjusted inset legend("topright", legend = variables, fill = rainbow(length(variables)), bg = "transparent", bty = "n", xpd = TRUE, inset = c(-0.35, 0.05), # Adjust inset values here cex = 0.8) # Calculate means for each variable, removing NA values mean_vals_variables <- sapply(data_variables, function(x) mean(x, na.rm = TRUE)) # Calculate Mean Absolute Deviation (MAD) for each variable mad_vals_variables <- sapply(data_variables, function(x) mean(abs(x - mean(x, na.rm = TRUE)), na.rm = TRUE)) # Calculate positive and negative deviations for each variable positive_deviations_variables <- sapply(data_variables, function(x) mean(x[x > mean(x, na.rm = TRUE)] - mean(x, na.rm = TRUE), na.rm = TRUE)) negative_deviations_variables <- sapply(data_variables, function(x) mean(mean(x, na.rm = TRUE) - x[x < mean(x, na.rm = TRUE)], na.rm = TRUE)) # Calculate percentage of values below the mean for each variable percent_below_mean_variables <- sapply(data_variables, function(x) { mean(x < mean(x, na.rm = TRUE), na.rm = TRUE) * 100 }) # Create a data frame for the means, MAD, positive and negative deviations, and percentage below mean for variables mean_table_variables <- data.frame(Variable = variables, Mean = round(mean_vals_variables, 2), MAD = round(mad_vals_variables, 2), Positive_Deviations = round(positive_deviations_variables, 2), Negative_Deviations = round(negative_deviations_variables, 2), Percent_Below_Mean = round(percent_below_mean_variables, 2)) # Print the table for variables with larger font size print(mean_table_variables, digits = 3, cex.main = 0.8) # Define the variables variables <- c("2020_SINGLE_MED_dif", "2020_SINGLE_1C_MED_dif", "2020_2AD_dif", "2020_2AD_1C_dif", "2020_2AD_2C_dif", "2020_2AD_3C_dif", "2020_3AD_dif", "2020_3AD_CH_dif") # Extract data for boxplot data_variables <- data[variables] # Create a boxplot boxplot(data_variables, main = "Boxplot of 2020 Health Variables Differences", col = rainbow(length(variables)), border = "black", las = 2) # Calculate min and max values for each variable min_vals <- sapply(data_variables, function(x) boxplot.stats(x)$stats[1]) max_vals <- sapply(data_variables, function(x) boxplot.stats(x)$stats[5]) # Add text labels for min and max values text(x = 1:length(variables), y = max_vals, labels = round(max_vals, 2), pos = 3, offset = 0.5) text(x = 1:length(variables), y = min_vals, labels = round(min_vals, 2), pos = 3, offset = 0.5) # Add legend outside of the plot area legend("topright", legend = variables, fill = rainbow(length(variables)), bg = "transparent", bty = "n", xpd = TRUE, inset = c(-0.5, 0.05)) # Expand the plot area par(mar = c(5, 6, 4, 10)) # Adjust the margin to make more space for the legend and min/max labels # Calculate means for each variable, removing NA values mean_vals_variables <- sapply(data_variables, function(x) mean(x, na.rm = TRUE)) # Calculate Mean Absolute Deviation (MAD) for each variable mad_vals_variables <- sapply(data_variables, function(x) mean(abs(x - mean(x, na.rm = TRUE)), na.rm = TRUE)) # Calculate positive and negative deviations for each variable positive_deviations_variables <- sapply(data_variables, function(x) mean(x[x > mean(x, na.rm = TRUE)] - mean(x, na.rm = TRUE), na.rm = TRUE)) negative_deviations_variables <- sapply(data_variables, function(x) mean(mean(x, na.rm = TRUE) - x[x < mean(x, na.rm = TRUE)], na.rm = TRUE)) # Calculate percentage of values below the mean for each variable percent_below_mean_variables <- sapply(data_variables, function(x) { mean(x < mean(x, na.rm = TRUE), na.rm = TRUE) * 100 }) # Create a data frame for the means, MAD, positive and negative deviations, and percentage below mean for variables mean_table_variables <- data.frame(Variable = variables, Mean = round(mean_vals_variables, 2), MAD = round(mad_vals_variables, 2), Positive_Deviations = round(positive_deviations_variables, 2), Negative_Deviations = round(negative_deviations_variables, 2), Percent_Below_Mean = round(percent_below_mean_variables, 2)) # Print the table for variables print(mean_table_variables)