# Load the readxl package
library(readxl)

# Import the expenditure workbook. Cells reading "", ":" or "0" are handed to
# read_excel() as missing-value markers; the zero is written as a string
# because c() would coerce a numeric 0 to "0" next to the other strings anyway.
workbook_path <- "[..]//SM_ExpPat_HH_Expenditure.xlsx"
na_markers <- c("", ":", "0")
data <- read_excel(workbook_path, na = na_markers)

# Show the first rows as a quick check of the import
head(data)

# new_var is the difference between the 2020 columns CP04_2020 and CP045_2020
data[["new_var"]] <- data[["CP04_2020"]] - data[["CP045_2020"]]

# "basics" is the sum of CP01_2020, CP02_2020 and CP04_2020
data[["basics"]] <- data[["CP01_2020"]] +
  data[["CP02_2020"]] +
  data[["CP04_2020"]]


library(ggplot2)


# Collect the distinct values of one column and report how many there are.
# NOTE(review): `column_name` looks like a placeholder — replace it with the
# name of the column to inspect (the plots in this file use V and A); confirm
# which one is intended.
unique_characters <- unique(data$column_name)
num_unique_characters <- length(unique_characters)

# Number of distinct values
print(num_unique_characters)

# The distinct values themselves
print(unique_characters)



# Load required libraries
library(ggplot2)
library(dplyr)

# Colour lookup for the codes plotted with `color = V` (the legend is labelled
# "Country"). Names are the codes, values are R colour names; the entry order
# is kept as it was. Some codes share a colour and cannot be told apart in a
# plot: AT/XK (red), BE/TR (salmon), BG/UK (tomato), EU25/LV (purple),
# NL/SK (pink), EEA28/EEA30_2007 (yellow).
color_palette <- c(
  LU = "firebrick", AT = "red", BE = "salmon", BG = "tomato",
  CY = "lightblue", CZ = "skyblue", DE = "darkblue", DK = "royalblue",
  EA = "deepskyblue", EA12 = "black", EA13 = "gray40", EA17 = "gray60",
  EA18 = "gray80", EE = "white", EEA28 = "yellow", EEA30_2007 = "yellow",
  EFTA = "darkgreen", EL = "green", ES = "limegreen", EU15 = "palegreen",
  EU25 = "purple", EU27_2007 = "orange", EU27_2020 = "orchid", FI = "cyan",
  FR = "magenta", HR = "violet", HU = "brown", IE = "tan",
  IT = "khaki", LT = "gold", LV = "purple", ME = "beige",
  MK = "navy", MT = "lavender", NL = "pink", NO = "maroon",
  PL = "coral", PT = "azure", RO = "ivory", RS = "aquamarine",
  SE = "blue", SI = "turquoise", SK = "pink", TR = "salmon",
  UK = "tomato", XK = "red"
)

# ---- Food vs CP045_2020 on complete rows, points coloured by V ----

# Food is the sum of CP01_2020 and CP02_2020
data[["Food"]] <- data[["CP01_2020"]] + data[["CP02_2020"]]

# Keep only the rows where both CP045_2020 and Food are present
data_clean <- filter(data, !is.na(CP045_2020), !is.na(Food))

# Name of the variable being related to CP045_2020
variables_2020 <- "Food"

# Pearson and Spearman correlation tests between Food and CP045_2020
pearson_test_2020 <- cor.test(
  data_clean$Food, data_clean$CP045_2020,
  method = "pearson"
)
spearman_test_2020 <- cor.test(
  data_clean$Food, data_clean$CP045_2020,
  method = "spearman"
)

# A single regression of Food on CP045_2020 serves both model names
lm_model_2020 <- lm(Food ~ CP045_2020, data = data_clean)
lm_model <- lm_model_2020
model_summary <- summary(lm_model)

# Point prediction and 95% prediction interval at CP045_2020 = 100
at_100 <- data.frame(CP045_2020 = 100)
predicted_value <- predict(lm_model, newdata = at_100)
prediction_interval <- predict(
  lm_model,
  newdata = at_100, interval = "prediction", level = 0.95
)

# Correlation coefficients, then the full test results
print(paste(
  "Correlation coefficient (Pearson) for Food and CP045_2020:",
  pearson_test_2020$estimate
))
print(paste(
  "Correlation coefficient (Spearman) for Food and CP045_2020:",
  spearman_test_2020$estimate
))

print("Pearson Test Results for Food and CP045_2020:")
print(pearson_test_2020)

print("Spearman Test Results for Food and CP045_2020:")
print(spearman_test_2020)

# Regression summary and the prediction at CP045_2020 = 100
print(model_summary)

print("Prediction Interval for Food when CP045_2020 = 100:")
print(prediction_interval)

print("Predicted Value of Food when CP045_2020 is 100:")
print(predicted_value)

print("Prediction Interval for Food when CP045_2020 is 100 (95% confidence level):")
print(prediction_interval)

# Individual pieces of the regression summary: call, R-squared, slope p-value
print("Linear Regression Formula:")
print(model_summary$call)

print("R-squared:")
print(model_summary$r.squared)

print("p-value:")
print(model_summary$coefficients["CP045_2020", "Pr(>|t|)"])

# Scatter of Food against CP045_2020 with a dashed red marker at x = 100
scatter_plot_2020 <- ggplot(
  data_clean,
  aes(CP045_2020, Food, color = V)
) +
  geom_point() +
  geom_vline(xintercept = 100, linetype = "dashed", color = "red") +
  scale_color_manual(values = color_palette) +
  labs(
    title = "Scatter Plot of Food vs CP045_2020",
    x = "CP045_2020", y = "Food", color = "Country"
  ) +
  theme_minimal()

print(scatter_plot_2020)


# Load required libraries
library(ggplot2)

# Colour lookup for the category codes plotted with `color = A`. Names are the
# codes, values are R colour names; the entry order is kept as it was.
# UNK and UNKUNK_Italy share the colour yellow.
# NOTE(review): "A2_CDH" breaks the *_DCH pattern of A_GE3_DCH and A1_DCH —
# confirm the spelling against the values actually present in data$A.
color_palette <- c(
  "A_GE3" = "firebrick", "A_GE3_DCH" = "red",
  "A2" = "salmon", "A2_CDH" = "tomato",
  "A1" = "lightblue", "A1_DCH" = "skyblue",
  "DEG1" = "darkblue", "DEG2" = "royalblue", "DEG3" = "deepskyblue",
  "QUINTILE1" = "black", "QUINTILE2" = "gray40", "QUINTILE3" = "gray60",
  "QUINTILE4" = "gray80", "QUINTILE5" = "white",
  "UNK" = "yellow", "UNKUNK_Italy" = "yellow",
  "Y_GE60" = "darkgreen", "Y45_59" = "green", "Y30_44" = "limegreen",
  "Y_LT30" = "palegreen"
)

# ---- Food vs CP045_2020 on all rows, points coloured by A ----
# Runs Pearson and Spearman correlation tests, regresses Food on CP045_2020,
# predicts Food at CP045_2020 = 100 and draws the scatter plot.

# Food is the sum of CP01_2020 and CP02_2020
data$Food <- data$CP01_2020 + data$CP02_2020

# Name of the variable being related to CP045_2020
variables_2020 <- c("Food")

pearson_test_2020 <- cor.test(data$Food, data$CP045_2020, method = "pearson")
spearman_test_2020 <- cor.test(
  data$Food, data$CP045_2020,
  method = "spearman"
)

# The model is fitted with bare column names plus `data =`. With a formula
# written as data$Food ~ data$CP045_2020 the predictor is literally called
# `data$CP045_2020`, so predict() resolves it from the full data frame instead
# of `newdata` and returns one interval per original row rather than the
# single interval at CP045_2020 = 100.
lm_model_2020 <- lm(Food ~ CP045_2020, data = data)
# Same fit under the second name used further down
lm_model <- lm_model_2020
model_summary <- summary(lm_model)

# Point prediction and 95% prediction interval at CP045_2020 = 100
new_obs <- data.frame(CP045_2020 = 100)
predicted_value <- predict(lm_model, newdata = new_obs)
prediction_interval <- predict(
  lm_model_2020,
  newdata = new_obs, interval = "prediction", level = 0.95
)

# Correlation coefficients, then the full test results
print(paste(
  "Correlation coefficient (Pearson) for Food and CP045_2020:",
  pearson_test_2020$estimate
))
print(paste(
  "Correlation coefficient (Spearman) for Food and CP045_2020:",
  spearman_test_2020$estimate
))

print("Pearson Test Results for Food and CP045_2020:")
print(pearson_test_2020)

print("Spearman Test Results for Food and CP045_2020:")
print(spearman_test_2020)

# Regression summary and the prediction at CP045_2020 = 100
print(model_summary)

print("Prediction Interval for Food when CP045_2020 = 100:")
print(prediction_interval)

print("Predicted Value of Food when CP045_2020 is 100:")
print(predicted_value)

print("Prediction Interval for Food when CP045_2020 is 100 (95% confidence level):")
print(prediction_interval)

# Individual pieces of the regression summary: call, R-squared, slope p-value
print("Linear Regression Formula:")
print(model_summary$call)

print("R-squared:")
print(model_summary$r.squared)

print("p-value:")
print(model_summary$coefficients["CP045_2020", "Pr(>|t|)"])

# Scatter of Food against CP045_2020 with a dashed red marker at x = 100
scatter_plot_2020 <- ggplot(
  data,
  aes(x = CP045_2020, y = Food, color = A)
) +
  geom_point() +
  geom_vline(xintercept = 100, linetype = "dashed", color = "red") +
  scale_color_manual(values = color_palette) +
  labs(x = "CP045_2020", y = "Food", color = "A") +
  ggtitle("Scatter Plot of Food vs CP045_2020") +
  theme_minimal()

print(scatter_plot_2020)





# Scatter of Food against CP045_2020 coloured by V, plus a normal probability
# plot of the residuals of lm_model_2020.
scatter_plot_2020 <- ggplot(
  data,
  aes(x = CP045_2020, y = Food, color = V)
) +
  geom_point() +
  geom_vline(xintercept = 100, linetype = "dashed", color = "red") +
  labs(x = "CP045_2020", y = "Food", color = "Country") +
  ggtitle("Scatter Plot of Food vs CP045_2020") +
  theme_minimal()

# Apply the manual palette only when it names at least one code present in V.
# The color_palette defined most recently above this point is keyed by the
# codes used with `color = A`; a manual scale whose names match none of the
# data values gives every point the NA colour, so in that case ggplot2's
# default colours are used instead.
if (any(unique(data$V) %in% names(color_palette))) {
  scatter_plot_2020 <- scatter_plot_2020 +
    scale_color_manual(values = color_palette)
}

# qqnorm() draws the plot itself and returns the plotted x/y coordinates as a
# list. The list is kept in normal_prob_plot but no longer printed, because
# printing it only dumps the raw numbers to the console.
normal_prob_plot <- qqnorm(
  residuals(lm_model_2020),
  main = "Normal Probability Plot of Residuals"
)
qqline(residuals(lm_model_2020))

# Show the scatter plot
print(scatter_plot_2020)




# Load required libraries
library(ggplot2)
library(dplyr)

# Count, per value of A, the rows whose Food exceeds 181.1764
num_items_by_A <- count(filter(data, Food > 181.1764), A)

# The (up to) five values of A with the highest counts
ranked_by_A <- arrange(num_items_by_A, desc(n))
top_5_categories <- pull(head(ranked_by_A, 5), A)

# Every slice starts out gray; slices in the top five are switched to red
slice_colors <- rep("gray", nrow(num_items_by_A))
slice_colors[num_items_by_A$A %in% top_5_categories] <- "red"

# Pie chart of the counts, one slice per value of A
pie(
  num_items_by_A$n,
  labels = num_items_by_A$A,
  main = "Number of Items by Type A (Above 181.1764)",
  col = slice_colors
)

# Load required libraries
library(ggplot2)
library(dplyr)

# Count, per value of V (country), the rows whose Food exceeds 181.1764
num_items_by_country <- count(filter(data, Food > 181.1764), V)

# The (up to) five values of V with the highest counts
ranked_by_country <- arrange(num_items_by_country, desc(n))
top_5_countries <- pull(head(ranked_by_country, 5), V)

# Every slice starts out gray; slices in the top five are switched to red
slice_colors <- rep("gray", nrow(num_items_by_country))
slice_colors[num_items_by_country$V %in% top_5_countries] <- "red"

# Pie chart of the counts, one slice per value of V
pie(
  num_items_by_country$n,
  labels = num_items_by_country$V,
  main = "Number of Items by Country (Above 181.1764)",
  col = slice_colors
)

# Load required libraries
library(ggplot2)

# Colour lookup for the codes plotted with `color = V` (the legend is labelled
# "Country"). Names are the codes, values are R colour names; the entry order
# is kept as it was. Some codes share a colour and cannot be told apart in a
# plot: AT/XK (red), BE/TR (salmon), BG/UK (tomato), EU25/LV (purple),
# NL/SK (pink), EEA28/EEA30_2007 (yellow).
color_palette <- c(
  LU = "firebrick", AT = "red", BE = "salmon", BG = "tomato",
  CY = "lightblue", CZ = "skyblue", DE = "darkblue", DK = "royalblue",
  EA = "deepskyblue", EA12 = "black", EA13 = "gray40", EA17 = "gray60",
  EA18 = "gray80", EE = "white", EEA28 = "yellow", EEA30_2007 = "yellow",
  EFTA = "darkgreen", EL = "green", ES = "limegreen", EU15 = "palegreen",
  EU25 = "purple", EU27_2007 = "orange", EU27_2020 = "orchid", FI = "cyan",
  FR = "magenta", HR = "violet", HU = "brown", IE = "tan",
  IT = "khaki", LT = "gold", LV = "purple", ME = "beige",
  MK = "navy", MT = "lavender", NL = "pink", NO = "maroon",
  PL = "coral", PT = "azure", RO = "ivory", RS = "aquamarine",
  SE = "blue", SI = "turquoise", SK = "pink", TR = "salmon",
  UK = "tomato", XK = "red"
)


# ---- CP04_2020 vs CP045_2020 on all rows, points coloured by V ----
# Runs Pearson and Spearman correlation tests, regresses CP04_2020 on
# CP045_2020, predicts CP04_2020 at CP045_2020 = 100 and draws the scatter.

# Name of the variable being related to CP045_2020
variables_2020 <- c("CP04_2020")

pearson_test_2020 <- cor.test(
  data$CP04_2020, data$CP045_2020,
  method = "pearson"
)
spearman_test_2020 <- cor.test(
  data$CP04_2020, data$CP045_2020,
  method = "spearman"
)

# The model is fitted with bare column names plus `data =`. With a formula
# written as data$CP04_2020 ~ data$CP045_2020 the predictor is literally
# called `data$CP045_2020`, so predict() resolves it from the full data frame
# instead of `newdata` and returns one interval per original row rather than
# the single interval at CP045_2020 = 100.
lm_model_2020 <- lm(CP04_2020 ~ CP045_2020, data = data)
# Same fit under the second name used further down
lm_model <- lm_model_2020
model_summary <- summary(lm_model)

# Point prediction and 95% prediction interval at CP045_2020 = 100
new_obs <- data.frame(CP045_2020 = 100)
predicted_value <- predict(lm_model, newdata = new_obs)
prediction_interval <- predict(
  lm_model_2020,
  newdata = new_obs, interval = "prediction", level = 0.95
)

# Correlation coefficients, then the full test results
print(paste(
  "Correlation coefficient (Pearson) for CP04_2020 and CP045_2020:",
  pearson_test_2020$estimate
))
print(paste(
  "Correlation coefficient (Spearman) for CP04_2020 and CP045_2020:",
  spearman_test_2020$estimate
))

print("Pearson Test Results for CP04_2020 and CP045_2020:")
print(pearson_test_2020)

print("Spearman Test Results for CP04_2020 and CP045_2020:")
print(spearman_test_2020)

# Regression summary and the prediction at CP045_2020 = 100
print(model_summary)

print("Prediction Interval for CP04_2020 when CP045_2020 = 100:")
print(prediction_interval)

print("Predicted Value of CP04_2020 when CP045_2020 is 100:")
print(predicted_value)

print("Prediction Interval for CP04_2020 when CP045_2020 is 100 (95% confidence level):")
print(prediction_interval)

# Individual pieces of the regression summary: call, R-squared, slope p-value
print("Linear Regression Formula:")
print(model_summary$call)

print("R-squared:")
print(model_summary$r.squared)

print("p-value:")
print(model_summary$coefficients["CP045_2020", "Pr(>|t|)"])

# Scatter of CP04_2020 against CP045_2020 with a dashed red marker at x = 100
scatter_plot_2020 <- ggplot(
  data,
  aes(x = CP045_2020, y = CP04_2020, color = V)
) +
  geom_point() +
  geom_vline(xintercept = 100, linetype = "dashed", color = "red") +
  scale_color_manual(values = color_palette) +
  labs(x = "CP045_2020", y = "CP04_2020", color = "Country") +
  ggtitle("Scatter Plot of CP04_2020 vs CP045_2020") +
  theme_minimal()

print(scatter_plot_2020)


# Load required libraries
library(ggplot2)

# Colour lookup for the category codes plotted with `color = A`. Names are the
# codes, values are R colour names; the entry order is kept as it was.
# UNK and UNKUNK_Italy share the colour yellow.
# NOTE(review): "A2_CDH" breaks the *_DCH pattern of A_GE3_DCH and A1_DCH —
# confirm the spelling against the values actually present in data$A.
color_palette <- c(
  "A_GE3" = "firebrick", "A_GE3_DCH" = "red",
  "A2" = "salmon", "A2_CDH" = "tomato",
  "A1" = "lightblue", "A1_DCH" = "skyblue",
  "DEG1" = "darkblue", "DEG2" = "royalblue", "DEG3" = "deepskyblue",
  "QUINTILE1" = "black", "QUINTILE2" = "gray40", "QUINTILE3" = "gray60",
  "QUINTILE4" = "gray80", "QUINTILE5" = "white",
  "UNK" = "yellow", "UNKUNK_Italy" = "yellow",
  "Y_GE60" = "darkgreen", "Y45_59" = "green", "Y30_44" = "limegreen",
  "Y_LT30" = "palegreen"
)


# ---- CP04_2020 vs CP045_2020 on all rows, points coloured by A ----
# Runs Pearson and Spearman correlation tests, regresses CP04_2020 on
# CP045_2020, predicts CP04_2020 at CP045_2020 = 100 and draws the scatter.

# Name of the variable being related to CP045_2020
variables_2020 <- c("CP04_2020")

pearson_test_2020 <- cor.test(
  data$CP04_2020, data$CP045_2020,
  method = "pearson"
)
spearman_test_2020 <- cor.test(
  data$CP04_2020, data$CP045_2020,
  method = "spearman"
)

# The model is fitted with bare column names plus `data =`. With a formula
# written as data$CP04_2020 ~ data$CP045_2020 the predictor is literally
# called `data$CP045_2020`, so predict() resolves it from the full data frame
# instead of `newdata` and returns one interval per original row rather than
# the single interval at CP045_2020 = 100.
lm_model_2020 <- lm(CP04_2020 ~ CP045_2020, data = data)
# Same fit under the second name used further down
lm_model <- lm_model_2020
model_summary <- summary(lm_model)

# Point prediction and 95% prediction interval at CP045_2020 = 100
new_obs <- data.frame(CP045_2020 = 100)
predicted_value <- predict(lm_model, newdata = new_obs)
prediction_interval <- predict(
  lm_model_2020,
  newdata = new_obs, interval = "prediction", level = 0.95
)

# Correlation coefficients, then the full test results
print(paste(
  "Correlation coefficient (Pearson) for CP04_2020 and CP045_2020:",
  pearson_test_2020$estimate
))
print(paste(
  "Correlation coefficient (Spearman) for CP04_2020 and CP045_2020:",
  spearman_test_2020$estimate
))

print("Pearson Test Results for CP04_2020 and CP045_2020:")
print(pearson_test_2020)

print("Spearman Test Results for CP04_2020 and CP045_2020:")
print(spearman_test_2020)

# Regression summary and the prediction at CP045_2020 = 100
print(model_summary)

print("Prediction Interval for CP04_2020 when CP045_2020 = 100:")
print(prediction_interval)

print("Predicted Value of CP04_2020 when CP045_2020 is 100:")
print(predicted_value)

print("Prediction Interval for CP04_2020 when CP045_2020 is 100 (95% confidence level):")
print(prediction_interval)

# Individual pieces of the regression summary: call, R-squared, slope p-value
print("Linear Regression Formula:")
print(model_summary$call)

print("R-squared:")
print(model_summary$r.squared)

print("p-value:")
print(model_summary$coefficients["CP045_2020", "Pr(>|t|)"])

# Scatter of CP04_2020 against CP045_2020 with a dashed red marker at x = 100
scatter_plot_2020 <- ggplot(
  data,
  aes(x = CP045_2020, y = CP04_2020, color = A)
) +
  geom_point() +
  geom_vline(xintercept = 100, linetype = "dashed", color = "red") +
  scale_color_manual(values = color_palette) +
  labs(x = "CP045_2020", y = "CP04_2020", color = "A") +
  ggtitle("Scatter Plot of CP04_2020 vs CP045_2020") +
  theme_minimal()

print(scatter_plot_2020)





# Scatter of CP04_2020 against CP045_2020 coloured by V, plus a normal
# probability plot of the residuals of lm_model_2020.
scatter_plot_2020 <- ggplot(
  data,
  aes(x = CP045_2020, y = CP04_2020, color = V)
) +
  geom_point() +
  geom_vline(xintercept = 100, linetype = "dashed", color = "red") +
  labs(x = "CP045_2020", y = "CP04_2020", color = "Country") +
  ggtitle("Scatter Plot of CP04_2020 vs CP045_2020") +
  theme_minimal()

# Apply the manual palette only when it names at least one code present in V.
# The color_palette defined most recently above this point is keyed by the
# codes used with `color = A`; a manual scale whose names match none of the
# data values gives every point the NA colour, so in that case ggplot2's
# default colours are used instead.
if (any(unique(data$V) %in% names(color_palette))) {
  scatter_plot_2020 <- scatter_plot_2020 +
    scale_color_manual(values = color_palette)
}

# qqnorm() draws the plot itself and returns the plotted x/y coordinates as a
# list. The list is kept in normal_prob_plot but no longer printed, because
# printing it only dumps the raw numbers to the console.
normal_prob_plot <- qqnorm(
  residuals(lm_model_2020),
  main = "Normal Probability Plot of Residuals"
)
qqline(residuals(lm_model_2020))

# Show the scatter plot
print(scatter_plot_2020)




# Load required libraries
library(ggplot2)
library(dplyr)

# Count, per value of A, the rows whose CP04_2020 exceeds 196.3928
num_items_by_A <- count(filter(data, CP04_2020 > 196.3928), A)

# The (up to) five values of A with the highest counts
ranked_by_A <- arrange(num_items_by_A, desc(n))
top_5_categories <- pull(head(ranked_by_A, 5), A)

# Every slice starts out gray; slices in the top five are switched to red
slice_colors <- rep("gray", nrow(num_items_by_A))
slice_colors[num_items_by_A$A %in% top_5_categories] <- "red"

# Pie chart of the counts, one slice per value of A
pie(
  num_items_by_A$n,
  labels = num_items_by_A$A,
  main = "Number of Items by Type A (Above 196.3928)",
  col = slice_colors
)

# Load required libraries
library(ggplot2)
library(dplyr)

# Count, per value of V (country), the rows whose CP04_2020 exceeds 196.3928
num_items_by_country <- count(filter(data, CP04_2020 > 196.3928), V)

# The (up to) five values of V with the highest counts
ranked_by_country <- arrange(num_items_by_country, desc(n))
top_5_countries <- pull(head(ranked_by_country, 5), V)

# Every slice starts out gray; slices in the top five are switched to red
slice_colors <- rep("gray", nrow(num_items_by_country))
slice_colors[num_items_by_country$V %in% top_5_countries] <- "red"

# Pie chart of the counts, one slice per value of V
pie(
  num_items_by_country$n,
  labels = num_items_by_country$V,
  main = "Number of Items by Country (Above 196.3928)",
  col = slice_colors
)

# Load required libraries
library(ggplot2)

# Colour lookup for the codes plotted with `color = V` (the legend is labelled
# "Country"). Names are the codes, values are R colour names; the entry order
# is kept as it was. Some codes share a colour and cannot be told apart in a
# plot: AT/XK (red), BE/TR (salmon), BG/UK (tomato), EU25/LV (purple),
# NL/SK (pink), EEA28/EEA30_2007 (yellow).
color_palette <- c(
  LU = "firebrick", AT = "red", BE = "salmon", BG = "tomato",
  CY = "lightblue", CZ = "skyblue", DE = "darkblue", DK = "royalblue",
  EA = "deepskyblue", EA12 = "black", EA13 = "gray40", EA17 = "gray60",
  EA18 = "gray80", EE = "white", EEA28 = "yellow", EEA30_2007 = "yellow",
  EFTA = "darkgreen", EL = "green", ES = "limegreen", EU15 = "palegreen",
  EU25 = "purple", EU27_2007 = "orange", EU27_2020 = "orchid", FI = "cyan",
  FR = "magenta", HR = "violet", HU = "brown", IE = "tan",
  IT = "khaki", LT = "gold", LV = "purple", ME = "beige",
  MK = "navy", MT = "lavender", NL = "pink", NO = "maroon",
  PL = "coral", PT = "azure", RO = "ivory", RS = "aquamarine",
  SE = "blue", SI = "turquoise", SK = "pink", TR = "salmon",
  UK = "tomato", XK = "red"
)



# ---- CP06_2020 vs CP045_2020 on all rows, points coloured by V ----
# Runs Pearson and Spearman correlation tests, regresses CP06_2020 on
# CP045_2020, predicts CP06_2020 at CP045_2020 = 100 and draws the scatter.

pearson_test_2020 <- cor.test(
  data$CP06_2020, data$CP045_2020,
  method = "pearson"
)
spearman_test_2020 <- cor.test(
  data$CP06_2020, data$CP045_2020,
  method = "spearman"
)

# The model is fitted with bare column names plus `data =`. With a formula
# written as data$CP06_2020 ~ data$CP045_2020 the predictor is literally
# called `data$CP045_2020`, so predict() resolves it from the full data frame
# instead of `newdata` and returns one interval per original row rather than
# the single interval at CP045_2020 = 100.
lm_model_2020 <- lm(CP06_2020 ~ CP045_2020, data = data)
# Same fit under the second name used further down
lm_model <- lm_model_2020
model_summary <- summary(lm_model)

# Point prediction and 95% prediction interval at CP045_2020 = 100
new_obs <- data.frame(CP045_2020 = 100)
predicted_value <- predict(lm_model, newdata = new_obs)
prediction_interval <- predict(
  lm_model_2020,
  newdata = new_obs, interval = "prediction", level = 0.95
)

# Correlation coefficients, then the full test results
print(paste(
  "Correlation coefficient (Pearson) for CP06_2020 and CP045_2020:",
  pearson_test_2020$estimate
))
print(paste(
  "Correlation coefficient (Spearman) for CP06_2020 and CP045_2020:",
  spearman_test_2020$estimate
))

print("Pearson Test Results for CP06_2020 and CP045_2020:")
print(pearson_test_2020)

print("Spearman Test Results for CP06_2020 and CP045_2020:")
print(spearman_test_2020)

# Regression summary and the prediction at CP045_2020 = 100
print(model_summary)

print("Prediction Interval for CP06_2020 when CP045_2020 = 100:")
print(prediction_interval)

print("Predicted Value of CP06_2020 when CP045_2020 is 100:")
print(predicted_value)

print("Prediction Interval for CP06_2020 when CP045_2020 is 100 (95% confidence level):")
print(prediction_interval)

# Individual pieces of the regression summary: call, R-squared, slope p-value
print("Linear Regression Formula:")
print(model_summary$call)

print("R-squared:")
print(model_summary$r.squared)

print("p-value:")
print(model_summary$coefficients["CP045_2020", "Pr(>|t|)"])

# Scatter of CP06_2020 against CP045_2020 with a dashed red marker at x = 100
scatter_plot_2020 <- ggplot(
  data,
  aes(x = CP045_2020, y = CP06_2020, color = V)
) +
  geom_point() +
  geom_vline(xintercept = 100, linetype = "dashed", color = "red") +
  scale_color_manual(values = color_palette) +
  labs(x = "CP045_2020", y = "CP06_2020", color = "Country") +
  ggtitle("Scatter Plot of CP06_2020 vs CP045_2020") +
  theme_minimal()

print(scatter_plot_2020)


# Load required libraries
library(ggplot2)

# Colour lookup for the category codes plotted with `color = A`. Names are the
# codes, values are R colour names; the entry order is kept as it was.
# UNK and UNKUNK_Italy share the colour yellow.
# NOTE(review): "A2_CDH" breaks the *_DCH pattern of A_GE3_DCH and A1_DCH —
# confirm the spelling against the values actually present in data$A.
color_palette <- c(
  "A_GE3" = "firebrick", "A_GE3_DCH" = "red",
  "A2" = "salmon", "A2_CDH" = "tomato",
  "A1" = "lightblue", "A1_DCH" = "skyblue",
  "DEG1" = "darkblue", "DEG2" = "royalblue", "DEG3" = "deepskyblue",
  "QUINTILE1" = "black", "QUINTILE2" = "gray40", "QUINTILE3" = "gray60",
  "QUINTILE4" = "gray80", "QUINTILE5" = "white",
  "UNK" = "yellow", "UNKUNK_Italy" = "yellow",
  "Y_GE60" = "darkgreen", "Y45_59" = "green", "Y30_44" = "limegreen",
  "Y_LT30" = "palegreen"
)

# ---- CP06_2020 vs CP045_2020 on all rows, points coloured by A ----
# Runs Pearson and Spearman correlation tests, regresses CP06_2020 on
# CP045_2020, predicts CP06_2020 at CP045_2020 = 100 and draws the scatter.

pearson_test_2020 <- cor.test(
  data$CP06_2020, data$CP045_2020,
  method = "pearson"
)
spearman_test_2020 <- cor.test(
  data$CP06_2020, data$CP045_2020,
  method = "spearman"
)

# The model is fitted with bare column names plus `data =`. With a formula
# written as data$CP06_2020 ~ data$CP045_2020 the predictor is literally
# called `data$CP045_2020`, so predict() resolves it from the full data frame
# instead of `newdata` and returns one interval per original row rather than
# the single interval at CP045_2020 = 100.
lm_model_2020 <- lm(CP06_2020 ~ CP045_2020, data = data)
# Same fit under the second name used further down
lm_model <- lm_model_2020
model_summary <- summary(lm_model)

# Point prediction and 95% prediction interval at CP045_2020 = 100
new_obs <- data.frame(CP045_2020 = 100)
predicted_value <- predict(lm_model, newdata = new_obs)
prediction_interval <- predict(
  lm_model_2020,
  newdata = new_obs, interval = "prediction", level = 0.95
)

# Correlation coefficients, then the full test results
print(paste(
  "Correlation coefficient (Pearson) for CP06_2020 and CP045_2020:",
  pearson_test_2020$estimate
))
print(paste(
  "Correlation coefficient (Spearman) for CP06_2020 and CP045_2020:",
  spearman_test_2020$estimate
))

print("Pearson Test Results for CP06_2020 and CP045_2020:")
print(pearson_test_2020)

print("Spearman Test Results for CP06_2020 and CP045_2020:")
print(spearman_test_2020)

# Regression summary and the prediction at CP045_2020 = 100
print(model_summary)

print("Prediction Interval for CP06_2020 when CP045_2020 = 100:")
print(prediction_interval)

print("Predicted Value of CP06_2020 when CP045_2020 is 100:")
print(predicted_value)

print("Prediction Interval for CP06_2020 when CP045_2020 is 100 (95% confidence level):")
print(prediction_interval)

# Individual pieces of the regression summary: call, R-squared, slope p-value
print("Linear Regression Formula:")
print(model_summary$call)

print("R-squared:")
print(model_summary$r.squared)

print("p-value:")
print(model_summary$coefficients["CP045_2020", "Pr(>|t|)"])

# Scatter of CP06_2020 against CP045_2020 with a dashed red marker at x = 100
scatter_plot_2020 <- ggplot(
  data,
  aes(x = CP045_2020, y = CP06_2020, color = A)
) +
  geom_point() +
  geom_vline(xintercept = 100, linetype = "dashed", color = "red") +
  scale_color_manual(values = color_palette) +
  labs(x = "CP045_2020", y = "CP06_2020", color = "A") +
  ggtitle("Scatter Plot of CP06_2020 vs CP045_2020") +
  theme_minimal()

print(scatter_plot_2020)





# Scatter of CP06_2020 against CP045_2020 coloured by V, plus a normal
# probability plot of the residuals of lm_model_2020.
scatter_plot_2020 <- ggplot(
  data,
  aes(x = CP045_2020, y = CP06_2020, color = V)
) +
  geom_point() +
  geom_vline(xintercept = 100, linetype = "dashed", color = "red") +
  labs(x = "CP045_2020", y = "CP06_2020", color = "Country") +
  ggtitle("Scatter Plot of CP06_2020 vs CP045_2020") +
  theme_minimal()

# Apply the manual palette only when it names at least one code present in V.
# The color_palette defined most recently above this point is keyed by the
# codes used with `color = A`; a manual scale whose names match none of the
# data values gives every point the NA colour, so in that case ggplot2's
# default colours are used instead.
if (any(unique(data$V) %in% names(color_palette))) {
  scatter_plot_2020 <- scatter_plot_2020 +
    scale_color_manual(values = color_palette)
}

# qqnorm() draws the plot itself and returns the plotted x/y coordinates as a
# list. The list is kept in normal_prob_plot but no longer printed, because
# printing it only dumps the raw numbers to the console.
normal_prob_plot <- qqnorm(
  residuals(lm_model_2020),
  main = "Normal Probability Plot of Residuals"
)
qqline(residuals(lm_model_2020))

# Show the scatter plot
print(scatter_plot_2020)




# Load required libraries
library(ggplot2)
library(dplyr)

# Count, per value of A, the rows whose CP06_2020 exceeds 43.72451
num_items_by_A <- count(filter(data, CP06_2020 > 43.72451), A)

# The (up to) five values of A with the highest counts
ranked_by_A <- arrange(num_items_by_A, desc(n))
top_5_categories <- pull(head(ranked_by_A, 5), A)

# Every slice starts out gray; slices in the top five are switched to red
slice_colors <- rep("gray", nrow(num_items_by_A))
slice_colors[num_items_by_A$A %in% top_5_categories] <- "red"

# Pie chart of the counts, one slice per value of A
pie(
  num_items_by_A$n,
  labels = num_items_by_A$A,
  main = "Number of Items by Type A (Above 43.72451)",
  col = slice_colors
)

# Load required libraries
library(ggplot2)
library(dplyr)

# Count, per value of V (country), the rows whose CP06_2020 exceeds 43.72451
num_items_by_country <- count(filter(data, CP06_2020 > 43.72451), V)

# The (up to) five values of V with the highest counts
ranked_by_country <- arrange(num_items_by_country, desc(n))
top_5_countries <- pull(head(ranked_by_country, 5), V)

# Every slice starts out gray; slices in the top five are switched to red
slice_colors <- rep("gray", nrow(num_items_by_country))
slice_colors[num_items_by_country$V %in% top_5_countries] <- "red"

# Pie chart of the counts, one slice per value of V
pie(
  num_items_by_country$n,
  labels = num_items_by_country$V,
  main = "Number of Items by Country (Above 43.72451)",
  col = slice_colors
)

# Load required libraries
library(ggplot2)

# Colour lookup for the codes plotted with `color = V` (the legend is labelled
# "Country"). Names are the codes, values are R colour names; the entry order
# is kept as it was. Some codes share a colour and cannot be told apart in a
# plot: AT/XK (red), BE/TR (salmon), BG/UK (tomato), EU25/LV (purple),
# NL/SK (pink), EEA28/EEA30_2007 (yellow).
color_palette <- c(
  LU = "firebrick", AT = "red", BE = "salmon", BG = "tomato",
  CY = "lightblue", CZ = "skyblue", DE = "darkblue", DK = "royalblue",
  EA = "deepskyblue", EA12 = "black", EA13 = "gray40", EA17 = "gray60",
  EA18 = "gray80", EE = "white", EEA28 = "yellow", EEA30_2007 = "yellow",
  EFTA = "darkgreen", EL = "green", ES = "limegreen", EU15 = "palegreen",
  EU25 = "purple", EU27_2007 = "orange", EU27_2020 = "orchid", FI = "cyan",
  FR = "magenta", HR = "violet", HU = "brown", IE = "tan",
  IT = "khaki", LT = "gold", LV = "purple", ME = "beige",
  MK = "navy", MT = "lavender", NL = "pink", NO = "maroon",
  PL = "coral", PT = "azure", RO = "ivory", RS = "aquamarine",
  SE = "blue", SI = "turquoise", SK = "pink", TR = "salmon",
  UK = "tomato", XK = "red"
)



# ---- CP10_2020 vs CP045_2020 on all rows, points coloured by V ----
# Runs Pearson and Spearman correlation tests, regresses CP10_2020 on
# CP045_2020, predicts CP10_2020 at CP045_2020 = 100 and draws the scatter.

pearson_test_2020 <- cor.test(
  data$CP10_2020, data$CP045_2020,
  method = "pearson"
)
spearman_test_2020 <- cor.test(
  data$CP10_2020, data$CP045_2020,
  method = "spearman"
)

# The model is fitted with bare column names plus `data =`. With a formula
# written as data$CP10_2020 ~ data$CP045_2020 the predictor is literally
# called `data$CP045_2020`, so predict() resolves it from the full data frame
# instead of `newdata` and returns one interval per original row rather than
# the single interval at CP045_2020 = 100.
lm_model_2020 <- lm(CP10_2020 ~ CP045_2020, data = data)
# Same fit under the second name used further down
lm_model <- lm_model_2020
model_summary <- summary(lm_model)

# Point prediction and 95% prediction interval at CP045_2020 = 100
new_obs <- data.frame(CP045_2020 = 100)
predicted_value <- predict(lm_model, newdata = new_obs)
prediction_interval <- predict(
  lm_model_2020,
  newdata = new_obs, interval = "prediction", level = 0.95
)

# Correlation coefficients, then the full test results
print(paste(
  "Correlation coefficient (Pearson) for CP10_2020 and CP045_2020:",
  pearson_test_2020$estimate
))
print(paste(
  "Correlation coefficient (Spearman) for CP10_2020 and CP045_2020:",
  spearman_test_2020$estimate
))

print("Pearson Test Results for CP10_2020 and CP045_2020:")
print(pearson_test_2020)

print("Spearman Test Results for CP10_2020 and CP045_2020:")
print(spearman_test_2020)

# Regression summary and the prediction at CP045_2020 = 100
print(model_summary)

print("Prediction Interval for CP10_2020 when CP045_2020 = 100:")
print(prediction_interval)

print("Predicted Value of CP10_2020 when CP045_2020 is 100:")
print(predicted_value)

print("Prediction Interval for CP10_2020 when CP045_2020 is 100 (95% confidence level):")
print(prediction_interval)

# Individual pieces of the regression summary: call, R-squared, slope p-value
print("Linear Regression Formula:")
print(model_summary$call)

print("R-squared:")
print(model_summary$r.squared)

print("p-value:")
print(model_summary$coefficients["CP045_2020", "Pr(>|t|)"])

# Scatter of CP10_2020 against CP045_2020 with a dashed red marker at x = 100
scatter_plot_2020 <- ggplot(
  data,
  aes(x = CP045_2020, y = CP10_2020, color = V)
) +
  geom_point() +
  geom_vline(xintercept = 100, linetype = "dashed", color = "red") +
  scale_color_manual(values = color_palette) +
  labs(x = "CP045_2020", y = "CP10_2020", color = "Country") +
  ggtitle("Scatter Plot of CP10_2020 vs CP045_2020") +
  theme_minimal()

print(scatter_plot_2020)


# Load required libraries
library(ggplot2)

# ---- CP10_2020 vs CP045_2020 on `data`, points coloured by `A` ----

# The country-coloured scatter plot at the end of this section still needs the
# country colours, so keep a copy before `color_palette` is replaced below.
# NOTE(review): assumes the script is run in file order, i.e. `color_palette`
# holds the country palette when this line executes.
country_palette <- color_palette

# Colours used for the values of column `A`
color_palette <- c(
  A_GE3 = "firebrick",
  A_GE3_DCH = "red",
  A2 = "salmon",
  A2_CDH = "tomato",
  A1 = "lightblue",
  A1_DCH = "skyblue",
  DEG1 = "darkblue",
  DEG2 = "royalblue",
  DEG3 = "deepskyblue",
  QUINTILE1 = "black",
  QUINTILE2 = "gray40",
  QUINTILE3 = "gray60",
  QUINTILE4 = "gray80",
  QUINTILE5 = "white",
  UNK = "yellow",
  UNKUNK_Italy = "yellow",
  Y_GE60 = "darkgreen",
  Y45_59 = "green",
  Y30_44 = "limegreen",
  Y_LT30 = "palegreen"
)

# Correlation tests (Pearson and Spearman) between the two columns
pearson_test_2020 <- cor.test(
  data$CP10_2020, data$CP045_2020, method = "pearson"
)
spearman_test_2020 <- cor.test(
  data$CP10_2020, data$CP045_2020, method = "spearman"
)

# Fit the regression once, with bare column names and `data =`. With the old
# formula (data$CP10_2020 ~ data$CP045_2020) predict() could not match the
# CP045_2020 column of `newdata` to a model term, so it returned one
# prediction per row of `data` instead of a single one at CP045_2020 = 100.
lm_model_2020 <- lm(CP10_2020 ~ CP045_2020, data = data)
lm_model <- lm_model_2020  # both names are kept; they refer to the same fit

# Point prediction and 95% prediction interval at CP045_2020 = 100
new_obs <- data.frame(CP045_2020 = 100)
predicted_value <- predict(lm_model, newdata = new_obs)
prediction_interval <- predict(
  lm_model,
  newdata = new_obs, interval = "prediction", level = 0.95
)

# Report the correlation coefficients, then the full test output
print(paste(
  "Correlation coefficient (Pearson) for CP10_2020 and CP045_2020:",
  pearson_test_2020$estimate
))
print(paste(
  "Correlation coefficient (Spearman) for CP10_2020 and CP045_2020:",
  spearman_test_2020$estimate
))

print("Pearson Test Results for CP10_2020 and CP045_2020:")
print(pearson_test_2020)

print("Spearman Test Results for CP10_2020 and CP045_2020:")
print(spearman_test_2020)

# Report the regression: full summary, prediction at 100, call, R-squared and
# the p-value of the CP045_2020 slope
model_summary <- summary(lm_model)
print(model_summary)

print("Predicted Value of CP10_2020 when CP045_2020 is 100:")
print(predicted_value)

print("Prediction Interval for CP10_2020 when CP045_2020 is 100 (95% confidence level):")
print(prediction_interval)

print("Linear Regression Formula:")
print(model_summary$call)

print("R-squared:")
print(model_summary$r.squared)

print("p-value:")
print(model_summary$coefficients["CP045_2020", "Pr(>|t|)"])

# Scatter plot coloured by `A`, with a dashed marker at CP045_2020 = 100
scatter_plot_2020 <- ggplot(
  data,
  aes(x = CP045_2020, y = CP10_2020, color = A)
) +
  geom_point() +
  geom_vline(xintercept = 100, linetype = "dashed", color = "red") +
  scale_color_manual(values = color_palette) +
  labs(x = "CP045_2020", y = "CP10_2020", color = "A") +
  ggtitle("Scatter Plot of CP10_2020 vs CP045_2020") +
  theme_minimal()

print(scatter_plot_2020)

# Same scatter plot coloured by country. It must use the country colours:
# the `A` palette has no entries named after the values of `V`.
scatter_plot_2020 <- ggplot(
  data,
  aes(x = CP045_2020, y = CP10_2020, color = V)
) +
  geom_point() +
  geom_vline(xintercept = 100, linetype = "dashed", color = "red") +
  scale_color_manual(values = country_palette) +
  labs(x = "CP045_2020", y = "CP10_2020", color = "Country") +
  ggtitle("Scatter Plot of CP10_2020 vs CP045_2020") +
  theme_minimal()

print(scatter_plot_2020)

# Normal probability (Q-Q) plot of the regression residuals. qqnorm() draws
# the plot itself and returns the plotted coordinates, so the returned list is
# stored but not printed.
normal_prob_plot <- qqnorm(
  residuals(lm_model_2020),
  main = "Normal Probability Plot of Residuals"
)
qqline(residuals(lm_model_2020))




# Load required libraries
library(ggplot2)
library(dplyr)

# Count the rows of `data` per value of `A`, keeping only rows whose
# CP10_2020 is below 5.147409
num_items_by_A <- count(filter(data, CP10_2020 < 5.147409), A)

# The five `A` values with the highest counts
top_5_categories <- head(pull(arrange(num_items_by_A, desc(n)), A), 5)

# Red slices for those five values, gray for every other value
slice_colors <- ifelse(
  num_items_by_A$A %in% top_5_categories,
  "red",
  "gray"
)

# Pie chart of the counts, one slice per value of `A`
pie(
  num_items_by_A$n,
  labels = num_items_by_A$A,
  main = "Number of Items by Type A (Below 5.147409 )",
  col = slice_colors
)

# Load required libraries
library(ggplot2)
library(dplyr)

# Count the rows of `data` per country (column `V`), keeping only rows whose
# CP10_2020 is below 5.147409
num_items_by_country <- count(filter(data, CP10_2020 < 5.147409), V)

# The five countries with the highest counts
top_5_countries <- head(pull(arrange(num_items_by_country, desc(n)), V), 5)

# Red slices for those five countries, gray for every other country
slice_colors <- ifelse(
  num_items_by_country$V %in% top_5_countries,
  "red",
  "gray"
)

# Pie chart of the counts, one slice per country
pie(
  num_items_by_country$n,
  labels = num_items_by_country$V,
  main = "Number of Items by Country (Below 5.147409 )",
  col = slice_colors
)

















# Load required libraries
library(ggplot2)

# Named colour vector passed to scale_color_manual() for the plots coloured by
# column `V` (labelled "Country" in the plots). Names are the codes, values
# are R colour names; entry order is kept as originally written.
color_palette <- c(
  "LU" = "firebrick", "AT" = "red", "BE" = "salmon", "BG" = "tomato",
  "CY" = "lightblue", "CZ" = "skyblue", "DE" = "darkblue",
  "DK" = "royalblue", "EA" = "deepskyblue", "EA12" = "black",
  "EA13" = "gray40", "EA17" = "gray60", "EA18" = "gray80",
  "EE" = "white", "EEA28" = "yellow", "EEA30_2007" = "yellow",
  "EFTA" = "darkgreen", "EL" = "green", "ES" = "limegreen",
  "EU15" = "palegreen", "EU25" = "purple", "EU27_2007" = "orange",
  "EU27_2020" = "orchid", "FI" = "cyan", "FR" = "magenta",
  "HR" = "violet", "HU" = "brown", "IE" = "tan", "IT" = "khaki",
  "LT" = "gold", "LV" = "purple", "ME" = "beige", "MK" = "navy",
  "MT" = "lavender", "NL" = "pink", "NO" = "maroon", "PL" = "coral",
  "PT" = "azure", "RO" = "ivory", "RS" = "aquamarine", "SE" = "blue",
  "SI" = "turquoise", "SK" = "pink", "TR" = "salmon", "UK" = "tomato",
  "XK" = "red"
)

# ---- CP10_2020 vs CP045_2020 on `children_subset`, coloured by country ----

# Correlation tests (Pearson and Spearman) between the two columns
pearson_test_2020 <- cor.test(
  children_subset$CP10_2020, children_subset$CP045_2020,
  method = "pearson"
)
spearman_test_2020 <- cor.test(
  children_subset$CP10_2020, children_subset$CP045_2020,
  method = "spearman"
)

# Fit the regression once, with bare column names and `data =`, so that
# predict() can match the CP045_2020 column of `newdata` to the model term.
lm_model_2020 <- lm(CP10_2020 ~ CP045_2020, data = children_subset)
lm_model <- lm_model_2020  # both names are kept; they refer to the same fit

# Point prediction and 95% prediction interval at CP045_2020 = 100. The old
# code passed the whole `children_subset` as `newdata`, which printed one
# prediction per row under labels that say "when CP045_2020 is 100".
new_obs <- data.frame(CP045_2020 = 100)
predicted_value <- predict(lm_model, newdata = new_obs)
prediction_interval <- predict(
  lm_model,
  newdata = new_obs, interval = "prediction", level = 0.95
)

# Report the correlation coefficients, then the full test output
print(paste(
  "Correlation coefficient (Pearson) for CP10_2020 and CP045_2020:",
  pearson_test_2020$estimate
))
print(paste(
  "Correlation coefficient (Spearman) for CP10_2020 and CP045_2020:",
  spearman_test_2020$estimate
))

print("Pearson Test Results for CP10_2020 and CP045_2020:")
print(pearson_test_2020)

print("Spearman Test Results for CP10_2020 and CP045_2020:")
print(spearman_test_2020)

# Report the regression: full summary, prediction at 100, call, R-squared and
# the p-value of the CP045_2020 slope
model_summary <- summary(lm_model)
print(model_summary)

print("Predicted Value of CP10_2020 when CP045_2020 is 100:")
print(predicted_value)

print("Prediction Interval for CP10_2020 when CP045_2020 is 100 (95% confidence level):")
print(prediction_interval)

print("Linear Regression Formula:")
print(model_summary$call)

print("R-squared:")
print(model_summary$r.squared)

print("p-value:")
print(model_summary$coefficients["CP045_2020", "Pr(>|t|)"])

# Scatter plot coloured by country, with a dashed marker at CP045_2020 = 100
scatter_plot_2020 <- ggplot(
  children_subset,
  aes(x = CP045_2020, y = CP10_2020, color = V)
) +
  geom_point() +
  geom_vline(xintercept = 100, linetype = "dashed", color = "red") +
  scale_color_manual(values = color_palette) +
  labs(x = "CP045_2020", y = "CP10_2020", color = "Country") +
  ggtitle("Scatter Plot of CP10_2020 vs CP045_2020") +
  theme_minimal()

print(scatter_plot_2020)


# Load required libraries
library(ggplot2)

# ---- CP10_2020 vs CP045_2020 on `children_subset`, coloured by `A` ----

# The country-coloured scatter plot at the end of this section still needs the
# country colours, so keep a copy before `color_palette` is replaced below.
# NOTE(review): assumes the script is run in file order, i.e. `color_palette`
# holds the country palette when this line executes.
country_palette <- color_palette

# Colours used for the values of column `A`
color_palette <- c(
  A_GE3 = "firebrick",
  A_GE3_DCH = "red",
  A2 = "salmon",
  A2_CDH = "tomato",
  A1 = "lightblue",
  A1_DCH = "skyblue",
  DEG1 = "darkblue",
  DEG2 = "royalblue",
  DEG3 = "deepskyblue",
  QUINTILE1 = "black",
  QUINTILE2 = "gray40",
  QUINTILE3 = "gray60",
  QUINTILE4 = "gray80",
  QUINTILE5 = "white",
  UNK = "yellow",
  UNKUNK_Italy = "yellow",
  Y_GE60 = "darkgreen",
  Y45_59 = "green",
  Y30_44 = "limegreen",
  Y_LT30 = "palegreen"
)

# Correlation tests (Pearson and Spearman) between the two columns
pearson_test_2020 <- cor.test(
  children_subset$CP10_2020, children_subset$CP045_2020,
  method = "pearson"
)
spearman_test_2020 <- cor.test(
  children_subset$CP10_2020, children_subset$CP045_2020,
  method = "spearman"
)

# Fit the regression once, with bare column names and `data =`, so that
# predict() can match the CP045_2020 column of `newdata` to the model term.
lm_model_2020 <- lm(CP10_2020 ~ CP045_2020, data = children_subset)
lm_model <- lm_model_2020  # both names are kept; they refer to the same fit

# Point prediction and 95% prediction interval at CP045_2020 = 100. The old
# code passed the whole `children_subset` as `newdata`, which printed one
# prediction per row under labels that say "when CP045_2020 is 100".
new_obs <- data.frame(CP045_2020 = 100)
predicted_value <- predict(lm_model, newdata = new_obs)
prediction_interval <- predict(
  lm_model,
  newdata = new_obs, interval = "prediction", level = 0.95
)

# Report the correlation coefficients, then the full test output
print(paste(
  "Correlation coefficient (Pearson) for CP10_2020 and CP045_2020:",
  pearson_test_2020$estimate
))
print(paste(
  "Correlation coefficient (Spearman) for CP10_2020 and CP045_2020:",
  spearman_test_2020$estimate
))

print("Pearson Test Results for CP10_2020 and CP045_2020:")
print(pearson_test_2020)

print("Spearman Test Results for CP10_2020 and CP045_2020:")
print(spearman_test_2020)

# Report the regression: full summary, prediction at 100, call, R-squared and
# the p-value of the CP045_2020 slope
model_summary <- summary(lm_model)
print(model_summary)

print("Predicted Value of CP10_2020 when CP045_2020 is 100:")
print(predicted_value)

print("Prediction Interval for CP10_2020 when CP045_2020 is 100 (95% confidence level):")
print(prediction_interval)

print("Linear Regression Formula:")
print(model_summary$call)

print("R-squared:")
print(model_summary$r.squared)

print("p-value:")
print(model_summary$coefficients["CP045_2020", "Pr(>|t|)"])

# Scatter plot coloured by `A`, with a dashed marker at CP045_2020 = 100
scatter_plot_2020 <- ggplot(
  children_subset,
  aes(x = CP045_2020, y = CP10_2020, color = A)
) +
  geom_point() +
  geom_vline(xintercept = 100, linetype = "dashed", color = "red") +
  scale_color_manual(values = color_palette) +
  labs(x = "CP045_2020", y = "CP10_2020", color = "A") +
  ggtitle("Scatter Plot of CP10_2020 vs CP045_2020") +
  theme_minimal()

print(scatter_plot_2020)

# Same scatter plot coloured by country. It must use the country colours:
# the `A` palette has no entries named after the values of `V`.
scatter_plot_2020 <- ggplot(
  children_subset,
  aes(x = CP045_2020, y = CP10_2020, color = V)
) +
  geom_point() +
  geom_vline(xintercept = 100, linetype = "dashed", color = "red") +
  scale_color_manual(values = country_palette) +
  labs(x = "CP045_2020", y = "CP10_2020", color = "Country") +
  ggtitle("Scatter Plot of CP10_2020 vs CP045_2020") +
  theme_minimal()

print(scatter_plot_2020)

# Normal probability (Q-Q) plot of the regression residuals. qqnorm() draws
# the plot itself and returns the plotted coordinates, so the returned list is
# stored but not printed.
normal_prob_plot <- qqnorm(
  residuals(lm_model_2020),
  main = "Normal Probability Plot of Residuals"
)
qqline(residuals(lm_model_2020))




# Load required libraries
library(ggplot2)
library(dplyr)

# Count the rows of `children_subset` per value of `A`, keeping only rows
# whose CP10_2020 is below 6.713799
num_items_by_A <- count(filter(children_subset, CP10_2020 < 6.713799), A)

# The five `A` values with the highest counts
top_5_categories <- head(pull(arrange(num_items_by_A, desc(n)), A), 5)

# Red slices for those five values, gray for every other value
slice_colors <- ifelse(
  num_items_by_A$A %in% top_5_categories,
  "red",
  "gray"
)

# Pie chart of the counts, one slice per value of `A`
pie(
  num_items_by_A$n,
  labels = num_items_by_A$A,
  main = "Number of Items by Type A (Below 6.713799)",
  col = slice_colors
)

# Load required libraries
library(ggplot2)
library(dplyr)

# Count the rows of `children_subset` per country (column `V`), keeping only
# rows whose CP10_2020 is below 6.713799
num_items_by_country <- count(
  filter(children_subset, CP10_2020 < 6.713799),
  V
)

# The five countries with the highest counts
top_5_countries <- head(pull(arrange(num_items_by_country, desc(n)), V), 5)

# Red slices for those five countries, gray for every other country
slice_colors <- ifelse(
  num_items_by_country$V %in% top_5_countries,
  "red",
  "gray"
)

# Pie chart of the counts, one slice per country
pie(
  num_items_by_country$n,
  labels = num_items_by_country$V,
  main = "Number of Items by Country (Below 6.713799)",
  col = slice_colors
)

# Load required libraries
library(ggplot2)

# Named colour vector passed to scale_color_manual() for the plots coloured by
# column `V` (labelled "Country" in the plots). Names are the codes, values
# are R colour names; entry order is kept as originally written.
color_palette <- c(
  "LU" = "firebrick", "AT" = "red", "BE" = "salmon", "BG" = "tomato",
  "CY" = "lightblue", "CZ" = "skyblue", "DE" = "darkblue",
  "DK" = "royalblue", "EA" = "deepskyblue", "EA12" = "black",
  "EA13" = "gray40", "EA17" = "gray60", "EA18" = "gray80",
  "EE" = "white", "EEA28" = "yellow", "EEA30_2007" = "yellow",
  "EFTA" = "darkgreen", "EL" = "green", "ES" = "limegreen",
  "EU15" = "palegreen", "EU25" = "purple", "EU27_2007" = "orange",
  "EU27_2020" = "orchid", "FI" = "cyan", "FR" = "magenta",
  "HR" = "violet", "HU" = "brown", "IE" = "tan", "IT" = "khaki",
  "LT" = "gold", "LV" = "purple", "ME" = "beige", "MK" = "navy",
  "MT" = "lavender", "NL" = "pink", "NO" = "maroon", "PL" = "coral",
  "PT" = "azure", "RO" = "ivory", "RS" = "aquamarine", "SE" = "blue",
  "SI" = "turquoise", "SK" = "pink", "TR" = "salmon", "UK" = "tomato",
  "XK" = "red"
)



# ---- CP10_BASIC_2020 vs CP045_2020 on `data`, coloured by country (V) ----

# Correlation tests (Pearson and Spearman) between the two columns
pearson_test_2020 <- cor.test(
  data$CP10_BASIC_2020, data$CP045_2020, method = "pearson"
)
spearman_test_2020 <- cor.test(
  data$CP10_BASIC_2020, data$CP045_2020, method = "spearman"
)

# Fit the regression once, with bare column names and `data =`. With the old
# formula (data$CP10_BASIC_2020 ~ data$CP045_2020) predict() could not match
# the CP045_2020 column of `newdata` to a model term, so it returned one
# prediction per row of `data` instead of a single one at CP045_2020 = 100.
lm_model_2020 <- lm(CP10_BASIC_2020 ~ CP045_2020, data = data)
lm_model <- lm_model_2020  # both names are kept; they refer to the same fit

# Point prediction and 95% prediction interval at CP045_2020 = 100
new_obs <- data.frame(CP045_2020 = 100)
predicted_value <- predict(lm_model, newdata = new_obs)
prediction_interval <- predict(
  lm_model,
  newdata = new_obs, interval = "prediction", level = 0.95
)

# Report the correlation coefficients, then the full test output
print(paste(
  "Correlation coefficient (Pearson) for CP10_BASIC_2020 and CP045_2020:",
  pearson_test_2020$estimate
))
print(paste(
  "Correlation coefficient (Spearman) for CP10_BASIC_2020 and CP045_2020:",
  spearman_test_2020$estimate
))

print("Pearson Test Results for CP10_BASIC_2020 and CP045_2020:")
print(pearson_test_2020)

print("Spearman Test Results for CP10_BASIC_2020 and CP045_2020:")
print(spearman_test_2020)

# Report the regression: full summary, prediction at 100, call, R-squared and
# the p-value of the CP045_2020 slope
model_summary <- summary(lm_model)
print(model_summary)

print("Predicted Value of CP10_BASIC_2020 when CP045_2020 is 100:")
print(predicted_value)

print("Prediction Interval for CP10_BASIC_2020 when CP045_2020 is 100 (95% confidence level):")
print(prediction_interval)

print("Linear Regression Formula:")
print(model_summary$call)

print("R-squared:")
print(model_summary$r.squared)

print("p-value:")
print(model_summary$coefficients["CP045_2020", "Pr(>|t|)"])

# Scatter plot coloured by country, with a dashed marker at CP045_2020 = 100
scatter_plot_2020 <- ggplot(
  data,
  aes(x = CP045_2020, y = CP10_BASIC_2020, color = V)
) +
  geom_point() +
  geom_vline(xintercept = 100, linetype = "dashed", color = "red") +
  scale_color_manual(values = color_palette) +
  labs(x = "CP045_2020", y = "CP10_BASIC_2020", color = "Country") +
  ggtitle("Scatter Plot of CP10_BASIC_2020 vs CP045_2020") +
  theme_minimal()

print(scatter_plot_2020)


# Load required libraries
library(ggplot2)

# ---- CP10_BASIC_2020 vs CP045_2020 on `data`, points coloured by `A` ----

# The country-coloured scatter plot at the end of this section still needs the
# country colours, so keep a copy before `color_palette` is replaced below.
# NOTE(review): assumes the script is run in file order, i.e. `color_palette`
# holds the country palette when this line executes.
country_palette <- color_palette

# Colours used for the values of column `A`
color_palette <- c(
  A_GE3 = "firebrick",
  A_GE3_DCH = "red",
  A2 = "salmon",
  A2_CDH = "tomato",
  A1 = "lightblue",
  A1_DCH = "skyblue",
  DEG1 = "darkblue",
  DEG2 = "royalblue",
  DEG3 = "deepskyblue",
  QUINTILE1 = "black",
  QUINTILE2 = "gray40",
  QUINTILE3 = "gray60",
  QUINTILE4 = "gray80",
  QUINTILE5 = "white",
  UNK = "yellow",
  UNKUNK_Italy = "yellow",
  Y_GE60 = "darkgreen",
  Y45_59 = "green",
  Y30_44 = "limegreen",
  Y_LT30 = "palegreen"
)

# Correlation tests (Pearson and Spearman) between the two columns
pearson_test_2020 <- cor.test(
  data$CP10_BASIC_2020, data$CP045_2020, method = "pearson"
)
spearman_test_2020 <- cor.test(
  data$CP10_BASIC_2020, data$CP045_2020, method = "spearman"
)

# Fit the regression once, with bare column names and `data =`. With the old
# formula (data$CP10_BASIC_2020 ~ data$CP045_2020) predict() could not match
# the CP045_2020 column of `newdata` to a model term, so it returned one
# prediction per row of `data` instead of a single one at CP045_2020 = 100.
lm_model_2020 <- lm(CP10_BASIC_2020 ~ CP045_2020, data = data)
lm_model <- lm_model_2020  # both names are kept; they refer to the same fit

# Point prediction and 95% prediction interval at CP045_2020 = 100
new_obs <- data.frame(CP045_2020 = 100)
predicted_value <- predict(lm_model, newdata = new_obs)
prediction_interval <- predict(
  lm_model,
  newdata = new_obs, interval = "prediction", level = 0.95
)

# Report the correlation coefficients, then the full test output
print(paste(
  "Correlation coefficient (Pearson) for CP10_BASIC_2020 and CP045_2020:",
  pearson_test_2020$estimate
))
print(paste(
  "Correlation coefficient (Spearman) for CP10_BASIC_2020 and CP045_2020:",
  spearman_test_2020$estimate
))

print("Pearson Test Results for CP10_BASIC_2020 and CP045_2020:")
print(pearson_test_2020)

print("Spearman Test Results for CP10_BASIC_2020 and CP045_2020:")
print(spearman_test_2020)

# Report the regression: full summary, prediction at 100, call, R-squared and
# the p-value of the CP045_2020 slope
model_summary <- summary(lm_model)
print(model_summary)

print("Predicted Value of CP10_BASIC_2020 when CP045_2020 is 100:")
print(predicted_value)

print("Prediction Interval for CP10_BASIC_2020 when CP045_2020 is 100 (95% confidence level):")
print(prediction_interval)

print("Linear Regression Formula:")
print(model_summary$call)

print("R-squared:")
print(model_summary$r.squared)

print("p-value:")
print(model_summary$coefficients["CP045_2020", "Pr(>|t|)"])

# Scatter plot coloured by `A`, with a dashed marker at CP045_2020 = 100
scatter_plot_2020 <- ggplot(
  data,
  aes(x = CP045_2020, y = CP10_BASIC_2020, color = A)
) +
  geom_point() +
  geom_vline(xintercept = 100, linetype = "dashed", color = "red") +
  scale_color_manual(values = color_palette) +
  labs(x = "CP045_2020", y = "CP10_BASIC_2020", color = "A") +
  ggtitle("Scatter Plot of CP10_BASIC_2020 vs CP045_2020") +
  theme_minimal()

print(scatter_plot_2020)

# Same scatter plot coloured by country. It must use the country colours:
# the `A` palette has no entries named after the values of `V`.
scatter_plot_2020 <- ggplot(
  data,
  aes(x = CP045_2020, y = CP10_BASIC_2020, color = V)
) +
  geom_point() +
  geom_vline(xintercept = 100, linetype = "dashed", color = "red") +
  scale_color_manual(values = country_palette) +
  labs(x = "CP045_2020", y = "CP10_BASIC_2020", color = "Country") +
  ggtitle("Scatter Plot of CP10_BASIC_2020 vs CP045_2020") +
  theme_minimal()

print(scatter_plot_2020)

# Normal probability (Q-Q) plot of the regression residuals. qqnorm() draws
# the plot itself and returns the plotted coordinates, so the returned list is
# stored but not printed.
normal_prob_plot <- qqnorm(
  residuals(lm_model_2020),
  main = "Normal Probability Plot of Residuals"
)
qqline(residuals(lm_model_2020))




# Load required libraries
library(ggplot2)
library(dplyr)

# Count the rows of `data` per value of `A`, keeping only rows whose
# CP10_BASIC_2020 is below 3.128399
num_items_by_A <- count(filter(data, CP10_BASIC_2020 < 3.128399), A)

# The five `A` values with the highest counts
top_5_categories <- head(pull(arrange(num_items_by_A, desc(n)), A), 5)

# Red slices for those five values, gray for every other value
slice_colors <- ifelse(
  num_items_by_A$A %in% top_5_categories,
  "red",
  "gray"
)

# Pie chart of the counts, one slice per value of `A`
pie(
  num_items_by_A$n,
  labels = num_items_by_A$A,
  main = "Number of Items by Type A (Below 3.128399)",
  col = slice_colors
)

# Load required libraries
library(ggplot2)
library(dplyr)

# Count the rows of `data` per country (column `V`), keeping only rows whose
# CP10_BASIC_2020 is below 3.128399
num_items_by_country <- count(filter(data, CP10_BASIC_2020 < 3.128399), V)

# The five countries with the highest counts
top_5_countries <- head(pull(arrange(num_items_by_country, desc(n)), V), 5)

# Red slices for those five countries, gray for every other country
slice_colors <- ifelse(
  num_items_by_country$V %in% top_5_countries,
  "red",
  "gray"
)

# Pie chart of the counts, one slice per country
pie(
  num_items_by_country$n,
  labels = num_items_by_country$V,
  main = "Number of Items by Country (Below 3.128399)",
  col = slice_colors
)

# Load required libraries
library(ggplot2)

# Named colour vector passed to scale_color_manual() for the plots coloured by
# column `V` (labelled "Country" in the plots). Names are the codes, values
# are R colour names; entry order is kept as originally written.
color_palette <- c(
  "LU" = "firebrick", "AT" = "red", "BE" = "salmon", "BG" = "tomato",
  "CY" = "lightblue", "CZ" = "skyblue", "DE" = "darkblue",
  "DK" = "royalblue", "EA" = "deepskyblue", "EA12" = "black",
  "EA13" = "gray40", "EA17" = "gray60", "EA18" = "gray80",
  "EE" = "white", "EEA28" = "yellow", "EEA30_2007" = "yellow",
  "EFTA" = "darkgreen", "EL" = "green", "ES" = "limegreen",
  "EU15" = "palegreen", "EU25" = "purple", "EU27_2007" = "orange",
  "EU27_2020" = "orchid", "FI" = "cyan", "FR" = "magenta",
  "HR" = "violet", "HU" = "brown", "IE" = "tan", "IT" = "khaki",
  "LT" = "gold", "LV" = "purple", "ME" = "beige", "MK" = "navy",
  "MT" = "lavender", "NL" = "pink", "NO" = "maroon", "PL" = "coral",
  "PT" = "azure", "RO" = "ivory", "RS" = "aquamarine", "SE" = "blue",
  "SI" = "turquoise", "SK" = "pink", "TR" = "salmon", "UK" = "tomato",
  "XK" = "red"
)



# ---- CP07_transportserv_2020 vs CP045_2020 on `data`, coloured by country ----

# Correlation tests (Pearson and Spearman) between the two columns
pearson_test_2020 <- cor.test(
  data$CP07_transportserv_2020, data$CP045_2020, method = "pearson"
)
spearman_test_2020 <- cor.test(
  data$CP07_transportserv_2020, data$CP045_2020, method = "spearman"
)

# Fit the regression once, with bare column names and `data =`. With the old
# formula (data$CP07_transportserv_2020 ~ data$CP045_2020) predict() could not
# match the CP045_2020 column of `newdata` to a model term, so it returned one
# prediction per row of `data` instead of a single one at CP045_2020 = 100.
lm_model_2020 <- lm(CP07_transportserv_2020 ~ CP045_2020, data = data)
lm_model <- lm_model_2020  # both names are kept; they refer to the same fit

# Point prediction and 95% prediction interval at CP045_2020 = 100
new_obs <- data.frame(CP045_2020 = 100)
predicted_value <- predict(lm_model, newdata = new_obs)
prediction_interval <- predict(
  lm_model,
  newdata = new_obs, interval = "prediction", level = 0.95
)

# Report the correlation coefficients, then the full test output
print(paste(
  "Correlation coefficient (Pearson) for CP07_transportserv_2020 and CP045_2020:",
  pearson_test_2020$estimate
))
print(paste(
  "Correlation coefficient (Spearman) for CP07_transportserv_2020 and CP045_2020:",
  spearman_test_2020$estimate
))

print("Pearson Test Results for CP07_transportserv_2020 and CP045_2020:")
print(pearson_test_2020)

print("Spearman Test Results for CP07_transportserv_2020 and CP045_2020:")
print(spearman_test_2020)

# Report the regression: full summary, prediction at 100, call, R-squared and
# the p-value of the CP045_2020 slope
model_summary <- summary(lm_model)
print(model_summary)

print("Predicted Value of CP07_transportserv_2020 when CP045_2020 is 100:")
print(predicted_value)

print("Prediction Interval for CP07_transportserv_2020 when CP045_2020 is 100 (95% confidence level):")
print(prediction_interval)

print("Linear Regression Formula:")
print(model_summary$call)

print("R-squared:")
print(model_summary$r.squared)

print("p-value:")
print(model_summary$coefficients["CP045_2020", "Pr(>|t|)"])

# Scatter plot coloured by country, with a dashed marker at CP045_2020 = 100
scatter_plot_2020 <- ggplot(
  data,
  aes(x = CP045_2020, y = CP07_transportserv_2020, color = V)
) +
  geom_point() +
  geom_vline(xintercept = 100, linetype = "dashed", color = "red") +
  scale_color_manual(values = color_palette) +
  labs(x = "CP045_2020", y = "CP07_transportserv_2020", color = "Country") +
  ggtitle("Scatter Plot of CP07_transportserv_2020 vs CP045_2020") +
  theme_minimal()

print(scatter_plot_2020)


# Load required libraries
library(ggplot2)

# Named colour vector for the plots coloured by column `A`. Names are the
# category codes, values are R colour names; entry order is kept as
# originally written.
color_palette <- c(
  A_GE3 = "firebrick", A_GE3_DCH = "red",
  A2 = "salmon", A2_CDH = "tomato",
  A1 = "lightblue", A1_DCH = "skyblue",
  DEG1 = "darkblue", DEG2 = "royalblue", DEG3 = "deepskyblue",
  QUINTILE1 = "black", QUINTILE2 = "gray40", QUINTILE3 = "gray60",
  QUINTILE4 = "gray80", QUINTILE5 = "white",
  UNK = "yellow", UNKUNK_Italy = "yellow",
  Y_GE60 = "darkgreen", Y45_59 = "green",
  Y30_44 = "limegreen", Y_LT30 = "palegreen"
)

# Correlation tests and simple linear regression of CP07_transportserv_2020
# on CP045_2020, followed by a prediction interval at CP045_2020 = 100.

# Pearson test for CP07_transportserv_2020 and CP045_2020
pearson_test_2020 <- cor.test(
  data$CP07_transportserv_2020, data$CP045_2020,
  method = "pearson"
)
# Spearman test for CP07_transportserv_2020 and CP045_2020
spearman_test_2020 <- cor.test(
  data$CP07_transportserv_2020, data$CP045_2020,
  method = "spearman"
)

# Linear regression for CP07_transportserv_2020 and CP045_2020.
# The formula uses bare column names together with `data = data`. Written as
# `data$y ~ data$x`, the predictor term is looked up in the global `data`
# object instead of `newdata`, so predict() would ignore `newdata` and return
# one row per observation (with a warning) rather than a single prediction.
lm_model_2020 <- lm(CP07_transportserv_2020 ~ CP045_2020, data = data)

# Prediction interval for CP07_transportserv_2020 when CP045_2020 is 100
# (default level of predict.lm, i.e. 95%)
prediction_interval <- predict(
  lm_model_2020,
  newdata = data.frame(CP045_2020 = 100),
  interval = "prediction"
)

# Print correlation coefficients and tests
print(paste("Correlation coefficient (Pearson) for CP07_transportserv_2020 and CP045_2020:", pearson_test_2020$estimate))
print(paste("Correlation coefficient (Spearman) for CP07_transportserv_2020 and CP045_2020:", spearman_test_2020$estimate))

# Print Pearson test results
print("Pearson Test Results for CP07_transportserv_2020 and CP045_2020:")
print(pearson_test_2020)

# Print Spearman test results
print("Spearman Test Results for CP07_transportserv_2020 and CP045_2020:")
print(spearman_test_2020)

# Print regression summary
print(summary(lm_model_2020))

# Print prediction interval (a single row: fit, lwr, upr)
print("Prediction Interval for CP07_transportserv_2020 when CP045_2020 = 100:")
print(prediction_interval)

# Regression of CP07_transportserv_2020 on CP045_2020: point prediction and
# 95% prediction interval at CP045_2020 = 100, then the model's call,
# R-squared and slope p-value.
# The model is fitted once and its summary computed once; refitting the same
# formula on the same data only repeats work and yields an identical object.
lm_model <- lm(CP07_transportserv_2020 ~ CP045_2020, data = data)

# Single new observation at which the model is evaluated
new_obs <- data.frame(CP045_2020 = 100)

# Predict value of "CP07_transportserv_2020" when "CP045_2020" is 100
predicted_value <- predict(lm_model, newdata = new_obs)

# Prediction interval for "CP07_transportserv_2020" when "CP045_2020" is 100
prediction_interval <- predict(
  lm_model,
  newdata = new_obs,
  interval = "prediction",
  level = 0.95
)

# Print predicted value and interval
print("Predicted Value of CP07_transportserv_2020 when CP045_2020 is 100:")
print(predicted_value)

print("Prediction Interval for CP07_transportserv_2020 when CP045_2020 is 100 (95% confidence level):")
print(prediction_interval)

lm_summary <- summary(lm_model)

# Print formula
print("Linear Regression Formula:")
print(lm_summary$call)

# Print R-squared value
print("R-squared:")
print(lm_summary$r.squared)

# Print p-value of the t-test on the CP045_2020 coefficient
print("p-value:")
print(lm_summary$coefficients["CP045_2020", "Pr(>|t|)"])

# Scatter plot of CP07_transportserv_2020 against CP045_2020, coloured by the
# category variable A. The dashed red vertical line marks CP045_2020 = 100.
scatter_plot_2020 <- ggplot(
  data = data,
  mapping = aes(x = CP045_2020, y = CP07_transportserv_2020, colour = A)
) +
  geom_point() +
  geom_vline(xintercept = 100, colour = "red", linetype = "dashed") +
  scale_colour_manual(values = color_palette) +
  labs(
    title = "Scatter Plot of CP07_transportserv_2020 vs CP045_2020",
    x = "CP045_2020",
    y = "CP07_transportserv_2020",
    colour = "A"
  ) +
  theme_minimal()

# Render the plot
print(scatter_plot_2020)





# Scatter plot of CP07_transportserv_2020 against CP045_2020 coloured by
# country (V), plus a normal Q-Q plot of the `lm_model_2020` residuals.

# A manual colour scale only colours values that appear among the names of
# `values`; anything else is drawn in the NA colour. `color_palette` was last
# assigned the palette keyed by the A categories, so it is applied to V only
# when it really covers V's codes. Otherwise ggplot2's default discrete scale
# is used so that every country still gets its own colour.
v_codes <- unique(as.character(data$V))
v_codes <- v_codes[!is.na(v_codes)]
v_colour_scale <- if (all(v_codes %in% names(color_palette))) {
  scale_color_manual(values = color_palette)
} else {
  scale_color_discrete()
}

# Create scatter plot for CP07_transportserv_2020 and CP045_2020
scatter_plot_2020 <- ggplot(data, aes(x = CP045_2020, y = CP07_transportserv_2020, color = V)) +
  geom_point() +
  geom_vline(xintercept = 100, linetype = "dashed", color = "red") +
  v_colour_scale +
  labs(x = "CP045_2020", y = "CP07_transportserv_2020", color = "Country") +
  ggtitle("Scatter Plot of CP07_transportserv_2020 vs CP045_2020") +
  theme_minimal()

# Create normal probability plot. qqnorm() draws the plot as a side effect and
# returns the plotted coordinates as a list, so the result is kept but not
# printed (printing it would dump the raw x/y vectors to the console).
normal_prob_plot <- qqnorm(residuals(lm_model_2020), main = "Normal Probability Plot of Residuals")
qqline(residuals(lm_model_2020))

# Print scatter plot
print(scatter_plot_2020)




# Load required libraries
library(ggplot2)
library(dplyr)

# Pie chart of how many rows exceed a CP07_transportserv_2020 threshold,
# broken down by category A, with the five most frequent categories in red.

# Threshold used by the filter. The chart title is built from the same value;
# it previously hard-coded a different number (181.1764) than the filter used.
# NOTE(review): confirm 196.3928 is the intended cut-off.
transport_threshold <- 196.3928

# Calculate the number of items classified by type "A"
num_items_by_A <- data %>%
  filter(CP07_transportserv_2020 > transport_threshold) %>%
  count(A)

# Sort the counts in descending order and extract the top 5 categories
top_5_categories <- num_items_by_A %>%
  arrange(desc(n)) %>%
  head(5) %>%
  pull(A)

# Red for the top 5 categories, gray for the rest
slice_colors <- ifelse(num_items_by_A$A %in% top_5_categories, "red", "gray")

# Create a pie chart with custom colors
pie(
  num_items_by_A$n,
  labels = num_items_by_A$A,
  main = paste0("Number of Items by Type A (Above ", transport_threshold, ")"),
  col = slice_colors
)

# Load required libraries
library(ggplot2)
library(dplyr)

# Pie chart of how many rows exceed a CP07_transportserv_2020 threshold,
# broken down by country (V), with the five most frequent countries in red.

# Threshold used by the filter. The chart title is built from the same value;
# it previously hard-coded a different number (181.1764) than the filter used.
# NOTE(review): confirm 196.3928 is the intended cut-off.
transport_threshold <- 196.3928

# Calculate the number of items classified by country (variable V)
num_items_by_country <- data %>%
  filter(CP07_transportserv_2020 > transport_threshold) %>%
  count(V)

# Sort the counts in descending order and extract the top 5 countries
top_5_countries <- num_items_by_country %>%
  arrange(desc(n)) %>%
  head(5) %>%
  pull(V)

# Red for the top 5 countries, gray for the rest
slice_colors <- ifelse(num_items_by_country$V %in% top_5_countries, "red", "gray")

# Create a pie chart with custom colors
pie(
  num_items_by_country$n,
  labels = num_items_by_country$V,
  main = paste0("Number of Items by Country (Above ", transport_threshold, ")"),
  col = slice_colors
)


# Load required libraries
library(ggplot2)

# Named colour vector keyed by the country / aggregate codes found in V.
# It replaces any `color_palette` defined earlier in the script.
# NOTE(review): several codes share a colour and cannot be told apart in a
# plot: yellow (EEA28, EEA30_2007), purple (EU25, LV), pink (NL, SK),
# salmon (BE, TR), tomato (BG, UK), red (AT, XK).
color_palette <- c(
  LU = "firebrick",
  AT = "red",
  BE = "salmon",
  BG = "tomato",
  CY = "lightblue",
  CZ = "skyblue",
  DE = "darkblue",
  DK = "royalblue",
  EA = "deepskyblue",
  EA12 = "black",
  EA13 = "gray40",
  EA17 = "gray60",
  EA18 = "gray80",
  EE = "white",
  EEA28 = "yellow",
  EEA30_2007 = "yellow",
  EFTA = "darkgreen",
  EL = "green",
  ES = "limegreen",
  EU15 = "palegreen",
  EU25 = "purple",
  EU27_2007 = "orange",
  EU27_2020 = "orchid",
  FI = "cyan",
  FR = "magenta",
  HR = "violet",
  HU = "brown",
  IE = "tan",
  IT = "khaki",
  LT = "gold",
  LV = "purple",
  ME = "beige",
  MK = "navy",
  MT = "lavender",
  NL = "pink",
  NO = "maroon",
  PL = "coral",
  PT = "azure",
  RO = "ivory",
  RS = "aquamarine",
  SE = "blue",
  SI = "turquoise",
  SK = "pink",
  TR = "salmon",
  UK = "tomato",
  XK = "red"
)



# Correlation tests and simple linear regression of `basics` on CP045_2020,
# followed by a prediction interval at CP045_2020 = 100.

# Define variables to correlate
variables_2020 <- c("basics")

# Pearson test for basics and CP045_2020
pearson_test_2020 <- cor.test(data$basics, data$CP045_2020, method = "pearson")
# Spearman test for basics and CP045_2020
spearman_test_2020 <- cor.test(data$basics, data$CP045_2020, method = "spearman")

# Linear regression for basics and CP045_2020.
# The formula uses bare column names together with `data = data`. Written as
# `data$y ~ data$x`, the predictor term is looked up in the global `data`
# object instead of `newdata`, so predict() would ignore `newdata` and return
# one row per observation (with a warning) rather than a single prediction.
lm_model_2020 <- lm(basics ~ CP045_2020, data = data)

# Prediction interval for basics when CP045_2020 is 100
# (default level of predict.lm, i.e. 95%)
prediction_interval <- predict(
  lm_model_2020,
  newdata = data.frame(CP045_2020 = 100),
  interval = "prediction"
)

# Print correlation coefficients and tests
print(paste("Correlation coefficient (Pearson) for basics and CP045_2020:", pearson_test_2020$estimate))
print(paste("Correlation coefficient (Spearman) for basics and CP045_2020:", spearman_test_2020$estimate))

# Print Pearson test results
print("Pearson Test Results for basics and CP045_2020:")
print(pearson_test_2020)

# Print Spearman test results
print("Spearman Test Results for basics and CP045_2020:")
print(spearman_test_2020)

# Print regression summary
print(summary(lm_model_2020))

# Print prediction interval (a single row: fit, lwr, upr)
print("Prediction Interval for basics when CP045_2020 = 100:")
print(prediction_interval)

# Regression of `basics` on CP045_2020: point prediction and 95% prediction
# interval at CP045_2020 = 100, then the model's call, R-squared and slope
# p-value.
# The model is fitted once and its summary computed once; refitting the same
# formula on the same data only repeats work and yields an identical object.
lm_model <- lm(basics ~ CP045_2020, data = data)

# Single new observation at which the model is evaluated
new_obs <- data.frame(CP045_2020 = 100)

# Predict value of "basics" when "CP045_2020" is 100
predicted_value <- predict(lm_model, newdata = new_obs)

# Prediction interval for "basics" when "CP045_2020" is 100
prediction_interval <- predict(
  lm_model,
  newdata = new_obs,
  interval = "prediction",
  level = 0.95
)

# Print predicted value and interval
print("Predicted Value of basics when CP045_2020 is 100:")
print(predicted_value)

print("Prediction Interval for basics when CP045_2020 is 100 (95% confidence level):")
print(prediction_interval)

lm_summary <- summary(lm_model)

# Print formula
print("Linear Regression Formula:")
print(lm_summary$call)

# Print R-squared value
print("R-squared:")
print(lm_summary$r.squared)

# Print p-value of the t-test on the CP045_2020 coefficient
print("p-value:")
print(lm_summary$coefficients["CP045_2020", "Pr(>|t|)"])

# Scatter plot of `basics` against CP045_2020, one colour per value of V
# (shown as "Country" in the legend). The dashed red vertical line marks
# CP045_2020 = 100.
scatter_plot_2020 <- ggplot(
  data = data,
  mapping = aes(x = CP045_2020, y = basics, colour = V)
) +
  geom_point() +
  geom_vline(xintercept = 100, colour = "red", linetype = "dashed") +
  scale_colour_manual(values = color_palette) +
  labs(
    title = "Scatter Plot of basics vs CP045_2020",
    x = "CP045_2020",
    y = "basics",
    colour = "Country"
  ) +
  theme_minimal()

# Render the plot
print(scatter_plot_2020)


# Load required libraries
library(ggplot2)

# Named colour vector keyed by the category codes of variable A. It replaces
# the palette assigned to `color_palette` earlier in the script.
# NOTE(review): "UNK" and "UNKUNK_Italy" share the same colour ("yellow"), and
# "white" may be invisible on a white plot background - confirm this is wanted.
color_palette <- c(
  "A_GE3"        = "firebrick",
  "A_GE3_DCH"    = "red",
  "A2"           = "salmon",
  "A2_CDH"       = "tomato",
  "A1"           = "lightblue",
  "A1_DCH"       = "skyblue",
  "DEG1"         = "darkblue",
  "DEG2"         = "royalblue",
  "DEG3"         = "deepskyblue",
  "QUINTILE1"    = "black",
  "QUINTILE2"    = "gray40",
  "QUINTILE3"    = "gray60",
  "QUINTILE4"    = "gray80",
  "QUINTILE5"    = "white",
  "UNK"          = "yellow",
  "UNKUNK_Italy" = "yellow",
  "Y_GE60"       = "darkgreen",
  "Y45_59"       = "green",
  "Y30_44"       = "limegreen",
  "Y_LT30"       = "palegreen"
)


# Correlation tests and simple linear regression of `basics` on CP045_2020,
# followed by a prediction interval at CP045_2020 = 100.

# Define variables to correlate
variables_2020 <- c("basics")

# Pearson test for basics and CP045_2020
pearson_test_2020 <- cor.test(data$basics, data$CP045_2020, method = "pearson")
# Spearman test for basics and CP045_2020
spearman_test_2020 <- cor.test(data$basics, data$CP045_2020, method = "spearman")

# Linear regression for basics and CP045_2020.
# The formula uses bare column names together with `data = data`. Written as
# `data$y ~ data$x`, the predictor term is looked up in the global `data`
# object instead of `newdata`, so predict() would ignore `newdata` and return
# one row per observation (with a warning) rather than a single prediction.
lm_model_2020 <- lm(basics ~ CP045_2020, data = data)

# Prediction interval for basics when CP045_2020 is 100
# (default level of predict.lm, i.e. 95%)
prediction_interval <- predict(
  lm_model_2020,
  newdata = data.frame(CP045_2020 = 100),
  interval = "prediction"
)

# Print correlation coefficients and tests
print(paste("Correlation coefficient (Pearson) for basics and CP045_2020:", pearson_test_2020$estimate))
print(paste("Correlation coefficient (Spearman) for basics and CP045_2020:", spearman_test_2020$estimate))

# Print Pearson test results
print("Pearson Test Results for basics and CP045_2020:")
print(pearson_test_2020)

# Print Spearman test results
print("Spearman Test Results for basics and CP045_2020:")
print(spearman_test_2020)

# Print regression summary
print(summary(lm_model_2020))

# Print prediction interval (a single row: fit, lwr, upr)
print("Prediction Interval for basics when CP045_2020 = 100:")
print(prediction_interval)

# Regression of `basics` on CP045_2020: point prediction and 95% prediction
# interval at CP045_2020 = 100, then the model's call, R-squared and slope
# p-value.
# The model is fitted once and its summary computed once; refitting the same
# formula on the same data only repeats work and yields an identical object.
lm_model <- lm(basics ~ CP045_2020, data = data)

# Single new observation at which the model is evaluated
new_obs <- data.frame(CP045_2020 = 100)

# Predict value of "basics" when "CP045_2020" is 100
predicted_value <- predict(lm_model, newdata = new_obs)

# Prediction interval for "basics" when "CP045_2020" is 100
prediction_interval <- predict(
  lm_model,
  newdata = new_obs,
  interval = "prediction",
  level = 0.95
)

# Print predicted value and interval
print("Predicted Value of basics when CP045_2020 is 100:")
print(predicted_value)

print("Prediction Interval for basics when CP045_2020 is 100 (95% confidence level):")
print(prediction_interval)

lm_summary <- summary(lm_model)

# Print formula
print("Linear Regression Formula:")
print(lm_summary$call)

# Print R-squared value
print("R-squared:")
print(lm_summary$r.squared)

# Print p-value of the t-test on the CP045_2020 coefficient
print("p-value:")
print(lm_summary$coefficients["CP045_2020", "Pr(>|t|)"])

# Scatter plot of `basics` against CP045_2020, coloured by the category
# variable A. The dashed red vertical line marks CP045_2020 = 100.
scatter_plot_2020 <- ggplot(
  data = data,
  mapping = aes(x = CP045_2020, y = basics, colour = A)
) +
  geom_point() +
  geom_vline(xintercept = 100, colour = "red", linetype = "dashed") +
  scale_colour_manual(values = color_palette) +
  labs(
    title = "Scatter Plot of basics vs CP045_2020",
    x = "CP045_2020",
    y = "basics",
    colour = "A"
  ) +
  theme_minimal()

# Render the plot
print(scatter_plot_2020)





# Scatter plot of `basics` against CP045_2020 coloured by country (V), plus a
# normal Q-Q plot of the `lm_model_2020` residuals.

# A manual colour scale only colours values that appear among the names of
# `values`; anything else is drawn in the NA colour. `color_palette` was last
# assigned the palette keyed by the A categories, so it is applied to V only
# when it really covers V's codes. Otherwise ggplot2's default discrete scale
# is used so that every country still gets its own colour.
v_codes <- unique(as.character(data$V))
v_codes <- v_codes[!is.na(v_codes)]
v_colour_scale <- if (all(v_codes %in% names(color_palette))) {
  scale_color_manual(values = color_palette)
} else {
  scale_color_discrete()
}

# Create scatter plot for basics and CP045_2020
scatter_plot_2020 <- ggplot(data, aes(x = CP045_2020, y = basics, color = V)) +
  geom_point() +
  geom_vline(xintercept = 100, linetype = "dashed", color = "red") +
  v_colour_scale +
  labs(x = "CP045_2020", y = "basics", color = "Country") +
  ggtitle("Scatter Plot of basics vs CP045_2020") +
  theme_minimal()

# Create normal probability plot. qqnorm() draws the plot as a side effect and
# returns the plotted coordinates as a list, so the result is kept but not
# printed (printing it would dump the raw x/y vectors to the console).
normal_prob_plot <- qqnorm(residuals(lm_model_2020), main = "Normal Probability Plot of Residuals")
qqline(residuals(lm_model_2020))

# Print scatter plot
print(scatter_plot_2020)

# Load required libraries
library(ggplot2)

# Named colour vector keyed by the category codes of variable A. It replaces
# the palette assigned to `color_palette` earlier in the script.
# NOTE(review): "UNK" and "UNKUNK_Italy" share the same colour ("yellow"), and
# "white" may be invisible on a white plot background - confirm this is wanted.
color_palette <- c(
  "A_GE3"        = "firebrick",
  "A_GE3_DCH"    = "red",
  "A2"           = "salmon",
  "A2_CDH"       = "tomato",
  "A1"           = "lightblue",
  "A1_DCH"       = "skyblue",
  "DEG1"         = "darkblue",
  "DEG2"         = "royalblue",
  "DEG3"         = "deepskyblue",
  "QUINTILE1"    = "black",
  "QUINTILE2"    = "gray40",
  "QUINTILE3"    = "gray60",
  "QUINTILE4"    = "gray80",
  "QUINTILE5"    = "white",
  "UNK"          = "yellow",
  "UNKUNK_Italy" = "yellow",
  "Y_GE60"       = "darkgreen",
  "Y45_59"       = "green",
  "Y30_44"       = "limegreen",
  "Y_LT30"       = "palegreen"
)

# Replace 0 with NA in NDI2020. `%in%` never yields NA, so rows where NDI2020
# is already missing are simply left untouched.
data$NDI2020[data$NDI2020 %in% 0] <- NA

# Order the levels of A to match the palette keyed by the A categories, so
# legend order follows the palette. Values of A that are not palette names
# become NA.
data$A <- factor(data$A, levels = names(color_palette))

# V holds country codes, and `color_palette` at this point is keyed by the A
# categories. Forcing those names as the levels of V would turn every country
# into NA, so V is converted to a factor with its own observed levels instead.
data$V <- factor(data$V)

# Correlation tests, regression and predictions for NDI2020 as a function of
# CP045_2020. The model summary and the evaluation point are built once and
# reused by the reporting code below.

# Pearson and Spearman correlation tests
pearson_test_2020 <- cor.test(data$NDI2020, data$CP045_2020, method = "pearson")
spearman_test_2020 <- cor.test(data$NDI2020, data$CP045_2020, method = "spearman")

# Simple linear regression of NDI2020 on CP045_2020
lm_model_2020 <- lm(NDI2020 ~ CP045_2020, data = data)
ndi_summary <- summary(lm_model_2020)

# Single new observation at which the model is evaluated
ndi_new_obs <- data.frame(CP045_2020 = 100)

# Prediction interval at CP045_2020 = 100 (default level of predict.lm)
prediction_interval <- predict(
  lm_model_2020,
  newdata = ndi_new_obs,
  interval = "prediction"
)

# Correlation coefficients
print(paste("Correlation coefficient (Pearson) for NDI2020 and CP045_2020:", pearson_test_2020$estimate))
print(paste("Correlation coefficient (Spearman) for NDI2020 and CP045_2020:", spearman_test_2020$estimate))

# Full Pearson test output
print("Pearson Test Results for NDI2020 and CP045_2020:")
print(pearson_test_2020)

# Full Spearman test output
print("Spearman Test Results for NDI2020 and CP045_2020:")
print(spearman_test_2020)

# Regression summary
print(ndi_summary)

# Prediction interval
print("Prediction Interval for NDI2020 when CP045_2020 = 100:")
print(prediction_interval)

# Point prediction of NDI2020 at CP045_2020 = 100
predicted_value <- predict(lm_model_2020, newdata = ndi_new_obs)

# Prediction interval at CP045_2020 = 100 with the level stated explicitly
prediction_interval <- predict(
  lm_model_2020,
  newdata = ndi_new_obs,
  interval = "prediction",
  level = 0.95
)

# Predicted value and interval
print("Predicted Value of NDI2020 when CP045_2020 is 100:")
print(predicted_value)

print("Prediction Interval for NDI2020 when CP045_2020 is 100 (95% confidence level):")
print(prediction_interval)

# Model call (shows the fitted formula)
print("Linear Regression Formula:")
print(ndi_summary$call)

# Share of variance explained
print("R-squared:")
print(ndi_summary$r.squared)

# p-value of the t-test on the CP045_2020 coefficient
print("p-value:")
print(ndi_summary$coefficients["CP045_2020", "Pr(>|t|)"])

# Scatter plot of NDI2020 against CP045_2020, coloured by the category
# variable A. The dashed red vertical line marks CP045_2020 = 100.
scatter_plot_2020_A <- ggplot(
  data = data,
  mapping = aes(x = CP045_2020, y = NDI2020, colour = A)
) +
  geom_point() +
  geom_vline(xintercept = 100, colour = "red", linetype = "dashed") +
  scale_colour_manual(values = color_palette) +
  labs(
    title = "Scatter Plot of NDI2020 vs CP045_2020",
    x = "CP045_2020",
    y = "NDI2020",
    colour = "A"
  ) +
  theme_minimal()

# Render the plot
print(scatter_plot_2020_A)

# Scatter plot of NDI2020 against CP045_2020 coloured by country (V).

# A manual colour scale only colours values that appear among the names of
# `values`; anything else is drawn in the NA colour. `color_palette` was last
# assigned the palette keyed by the A categories, so it is applied to V only
# when it really covers V's codes. Otherwise ggplot2's default discrete scale
# is used so that every country still gets its own colour.
v_codes <- unique(as.character(data$V))
v_codes <- v_codes[!is.na(v_codes)]
v_colour_scale <- if (all(v_codes %in% names(color_palette))) {
  scale_color_manual(values = color_palette)
} else {
  scale_color_discrete()
}

# Create scatter plot for NDI2020 and CP045_2020 with color based on V
scatter_plot_2020_V <- ggplot(data, aes(x = CP045_2020, y = NDI2020, color = V)) +
  geom_point() +
  geom_vline(xintercept = 100, linetype = "dashed", color = "red") +
  v_colour_scale +
  labs(x = "CP045_2020", y = "NDI2020", color = "Country") +
  ggtitle("Scatter Plot of NDI2020 vs CP045_2020") +
  theme_minimal()

# Print scatter plot
print(scatter_plot_2020_V)

# Normal Q-Q plot of the `lm_model_2020` residuals with its reference line
model_residuals <- residuals(lm_model_2020)
qqnorm(model_residuals, main = "Normal Probability Plot of Residuals")
qqline(model_residuals)


# Stacked histogram of CP045_2020 (bin width 1) with bars filled by the
# category variable A and a dashed red reference line at x = 100.

# `color_palette_A` is not assigned anywhere in this part of the script. Use
# it when it exists; otherwise fall back to `color_palette`, which holds the
# palette keyed by the A categories at this point.
fill_palette_a <- if (exists("color_palette_A")) {
  color_palette_A
} else {
  color_palette
}

# Create histogram for CP045_2020 in the whole database, classified by type "A"
histogram_CP045_2020 <- ggplot(data, aes(x = CP045_2020, fill = A)) +
  geom_histogram(binwidth = 1, color = "black") +
  labs(x = "CP045_2020", y = "Frequency", title = "Histogram of CP045_2020 by Type A") +
  scale_fill_manual(values = fill_palette_a) +  # Set color palette
  geom_vline(xintercept = 100, linetype = "dashed", color = "red") +  # Add vertical line at x = 100
  theme_minimal()

# Print histogram
print(histogram_CP045_2020)

# Boxplot of CP045_2020 per category of A (labelled "Household Type"), with a
# dashed red reference line at y = 100.

# `color_palette_A` is not assigned anywhere in this part of the script. Use
# it when it exists; otherwise fall back to `color_palette`, which holds the
# palette keyed by the A categories at this point.
fill_palette_a <- if (exists("color_palette_A")) {
  color_palette_A
} else {
  color_palette
}

# Create boxplot
boxplot_CP045_2020 <- ggplot(data, aes(x = A, y = CP045_2020, fill = A)) +
  geom_boxplot() +
  labs(x = "Household Type", y = "Expenditure in Energy in 2020", title = "Boxplot of Energy Expenditure by Household Type") +
  scale_fill_manual(values = fill_palette_a) +  # Set color palette
  geom_hline(yintercept = 100, linetype = "dashed", color = "red") +  # Add horizontal line at y = 100
  theme_minimal()

# Print boxplot
print(boxplot_CP045_2020)


# Boxplot of CP045_2020 per country (V), with a dashed red reference line at
# y = 100.

# A manual fill scale only fills values that appear among the names of
# `values`; anything else is drawn in the NA colour. `color_palette` was last
# assigned the palette keyed by the A categories, so it is applied to V only
# when it really covers V's codes. Otherwise ggplot2's default discrete scale
# is used so that every country still gets its own fill.
v_codes <- unique(as.character(data$V))
v_codes <- v_codes[!is.na(v_codes)]
v_fill_scale <- if (all(v_codes %in% names(color_palette))) {
  scale_fill_manual(values = color_palette)
} else {
  scale_fill_discrete()
}

# Create boxplot
boxplot_CP045_2020 <- ggplot(data, aes(x = V, y = CP045_2020, fill = V)) +
  geom_boxplot() +
  labs(x = "Country", y = "Expenditure in Energy in 2020", title = "Boxplot of Energy Expenditure by Country") +
  v_fill_scale +  # Set color palette
  geom_hline(yintercept = 100, linetype = "dashed", color = "red") +  # Add horizontal line at y = 100
  theme_minimal()

# Print boxplot
print(boxplot_CP045_2020)