## 01_preprocess_normalize.R
## Human miRNA filtering + quantile normalisation + PCA
## Input:  GSE106780 series matrix + GPL21572 annotation
## Output: FPHL_final_miRNA_matrix.csv, PCA_coordinates.csv (S14 Table)

library(affy)
library(preprocessCore)

# ---- 1. Load expression table and platform annotation ----
series_file <- "../data/source/GSE106780_series_matrix.txt.gz"
annot_file  <- "../data/source/GPL21572_annotation.txt"

# Expression table: gzip-compressed, tab-delimited text. "!" is the comment
# character, the first column supplies the row names, and the column headers
# are kept verbatim (check.names = FALSE).
gse <- read.table(
  gzfile(series_file),
  sep = "\t",
  header = TRUE,
  row.names = 1,
  comment.char = "!",
  check.names = FALSE
)

# Annotation table: tab-delimited, "#" is the comment character, quote
# characters are read literally (quote = "") and short rows are padded
# (fill = TRUE).
annot <- read.table(
  annot_file,
  sep = "\t",
  header = TRUE,
  quote = "",
  comment.char = "#",
  fill = TRUE
)

# ---- 2. Map probe IDs to miRNA names ----
# Look-up vector: names are probe IDs, values are miRNA names. The columns are
# fetched with `$` (as before); a missing column is reported explicitly
# instead of failing later with an obscure message.
if (is.null(annot$ID) || is.null(annot$miRNA)) {
  stop("Annotation table must contain 'ID' and 'miRNA' columns.",
       call. = FALSE)
}
probe_to_mirna <- setNames(as.character(annot$miRNA), annot$ID)

# miRNA name for every expression row; NA where a probe has no annotation.
mapped_names <- unname(probe_to_mirna[rownames(gse)])

# ---- 3. Retain human miRNA rows only ----
# Filter BEFORE assigning row names: data-frame row names may be neither NA
# nor duplicated, so relabelling the full table first would abort as soon as
# one probe is unmapped or two probes share a name, even for rows that are
# discarded here anyway.
is_human_mirna <- !is.na(mapped_names) & grepl("^hsa-", mapped_names)
if (!any(is_human_mirna)) {
  stop("No probe maps to a human ('hsa-') miRNA; check the annotation file.",
       call. = FALSE)
}

# drop = FALSE keeps a data frame even when there is a single sample column.
gse_human <- gse[is_human_mirna, , drop = FALSE]

human_names <- mapped_names[is_human_mirna]
if (anyDuplicated(human_names) > 0) {
  # Several probes can carry the same miRNA name; keep all of them and make
  # the labels unique (name, name.1, ...) rather than failing.
  warning(sprintf("%d human miRNA row(s) share a name; labels made unique.",
                  sum(duplicated(human_names))),
          call. = FALSE)
  human_names <- make.unique(human_names)
}
rownames(gse_human) <- human_names

# Relabel `gse` itself too whenever the mapping is complete and unique, i.e.
# in every case where the direct row-name assignment is valid.
if (!anyNA(mapped_names) && anyDuplicated(mapped_names) == 0L) {
  rownames(gse) <- mapped_names
}

# ---- 4. Quantile normalisation ----
# Normalise the human-miRNA table as a matrix, then copy the row and column
# labels of the input matrix onto the normalised result.
expr_matrix <- as.matrix(gse_human)
expr_norm <- normalize.quantiles(expr_matrix)
dimnames(expr_norm) <- dimnames(expr_matrix)

# ---- 5. Write normalised matrix ----
# Row names (miRNA labels) are written as the first CSV column.
write.csv(
  expr_norm,
  file = "../data/derived/FPHL_final_miRNA_matrix.csv",
  row.names = TRUE
)

# ---- 6. PCA ----
# Samples are the columns of the normalised matrix. A sample is labelled
# "FPHL" when its column name contains that string, otherwise "Control".
samples <- colnames(expr_norm)
group  <- ifelse(grepl("FPHL", samples), "FPHL", "Control")
# NOTE(review): this relies on the column names carrying the "FPHL" label;
# if they are plain accession IDs every sample ends up as "Control" -- confirm.
if (length(unique(group)) < 2L) {
  warning("All samples fell into a single group; ",
          "sample names may not contain the 'FPHL' label.",
          call. = FALSE)
}

# Samples as rows; variables are centred but not scaled.
pca <- prcomp(t(expr_norm), center = TRUE, scale. = FALSE)

# Report up to five components. prcomp() can return fewer (small sample
# counts), in which case a fixed 1:5 index would be out of bounds.
n_pcs  <- min(5L, ncol(pca$x))
pc_idx <- seq_len(n_pcs)

pca_coords <- as.data.frame(pca$x[, pc_idx, drop = FALSE])
pca_coords$Sample <- samples
pca_coords$Group  <- group

# Row 2 of the summary's importance table is "Proportion of Variance";
# expressed here as a percentage rounded to two decimals.
var_explained <- round(100 * summary(pca)$importance[2, pc_idx], 2)
# Per-component variance table; no statement in this section writes it to
# disk, it is only left in the workspace.
pca_var_df <- data.frame(PC = sprintf("PC%d", pc_idx),
                         VarianceExplained = var_explained)

# ---- 7. Write PCA output (S14 Table) ----
# Sample and group labels are already columns, so row names are not written.
write.csv(
  pca_coords,
  file = "../data/derived/PCA_coordinates.csv",
  row.names = FALSE
)

# Console summary: size of the normalised matrix, then the percentage of
# variance carried by the first two components.
dims_msg <- sprintf(
  "Normalised matrix: %d human miRNAs x %d samples\n",
  nrow(expr_norm), ncol(expr_norm)
)
cat(dims_msg)

variance_msg <- sprintf(
  "PC1: %.1f%%, PC2: %.1f%%\n",
  var_explained[1], var_explained[2]
)
cat(variance_msg)
