# The minimum requirements for successfully running this script are: # 1. Raw EMG data prepared in RData format, one list where each named element is a trial # formatted as data frame with columns named "time" and muscle abbreviations, # file saved as "RAW_EMG.RData" # 2. Gait cycle times prepared in RData format, one list where each named element is a trial # formatted as data frame with columns named "touchdown" and "stance", times in [s], # file saved as "CYCLE_TIMES.RData" # 3. Trials are named as "CYCLE_TIMES_IDxxxx_AA.*_yy" or "RAW_EMG_IDxxxx_AA.*_yy", where: # - IDxxxx is the participant's code (e.g., ID0002 or ID0456) # - AA is the condition (e.g., TW or OR for treadmill or overground walking, but can # as well be longer such as TR_YOUNG_FEMALE, etc.) # - yy is the trial number (e.g., 01 or 155, etc.) # Using regex: CYCLE_TIMES_ID[0-9]*_.*_[0-9]*, RAW_EMG_ID[0-9]*_.*_[0-9]* # Preparation ---- # Install (if needed) required packages pkgs_list <- c("tcltk", "parallel", "progress", "signal", "gtools", "Cairo", "ggplot2", "gridExtra", "benchmarkme", "plyr", "reshape2") pkgs_new <- pkgs_list[!(pkgs_list %in% installed.packages()[, "Package"])] if (length(pkgs_new)) install.packages(pkgs_new) # Load required packages lapply(pkgs_list, library, character.only=T) # ATTENTION! The following line removes all objects from the global environment # except for the ones listed ("cl" in this case) rm(list=setdiff(ls(), c("cl"))) # Where are your files located if not in the same folder as the project's? if (all(file.exists("CYCLE_TIMES.RData", "RAW_EMG.RData"))) { data_path <- getwd() } else { if (.Platform$OS.type=="windows") { data_path <- choose.dir(caption="Select data folder") data_path <- gsub("\\\\", .Platform$file.sep, data_path) } else { data_path <- tcltk::tk_choose.dir(caption="Select data folder") } } # Create "Graphs" folder if it does not exist data_path <- paste0(data_path, .Platform$file.sep) graph_path <- paste0(data_path, "Graphs", .Platform$file.sep) dir.create(graph_path, showWarnings=F) # Create cluster for parallel computing if not already done clusters <- objects() if (sum(grepl("^cl$", clusters))==0) { # Decide how many processor threads have to be excluded from the cluster # It is a good idea to leave at least one free, so that the machine can be used during computation cl <- parallel::makeCluster(max(1, parallel::detectCores()-1)) } # STEP 1 - Raw data processing ---- message("\n################################", "\n STEP 1/4 - Raw data processing", "\n################################") test <- length(list.files(data_path, pattern="^FILT_EMG.RData$")) if (test==1) { qq <- 1 while (!is.na(qq)) { message("\nFiltered EMG data is already present in the specified folder!", "\nDo you want to use this data (type 'y' for 'yes') or recalculate (type 'n' for 'no')?", "\nNOTE: new calculations might require a few minutes, depending on the size of the data set") qq <- readline() # Break if user decides if (qq=="y" || qq=="n") break } } else if (test==0) qq <- "n" if (qq=="n") { # Load raw EMG data and gait cycle times if not already done if (all(!grepl("^CYCLE_TIMES$", objects())) && all(!grepl("^RAW_EMG$", objects()))) { message("\nLoading raw data...") load(paste0(data_path, "CYCLE_TIMES.RData")) load(paste0(data_path, "RAW_EMG.RData")) message("...done!") } # Check for correct naming of trials if (any(!grepl("^RAW_EMG_ID[0-9]+_.+_[0-9]+$", names(RAW_EMG))) || any(!grepl("^CYCLE_TIMES_ID[0-9]+_.+_[0-9]+$", names(CYCLE_TIMES)))) { message("") stop("\nATTENTION: your trials are not named after the guidelines!\nPlease double-check names.") } # Create "Graphs/EMG" folder if it does not exist path_for_graphs <- paste0(graph_path, "EMG", .Platform$file.sep) dir.create(path_for_graphs, showWarnings=F) # Global filter and normalisation parameters, change as needed HPo <- 4 # High-pass filter order HPf <- 50 # High-pass filter frequency [Hz] LPo <- HPo # Low-pass filter order LPf <- 20 # Low-pass filter frequency [Hz] points <- 200 # Gait cycle length (interpolated points) cy_max <- 30 # Max number of cycles to be analysed cycles <- numeric() # To save number of cycles considered # Preallocate to write results FILT_EMG <- vector("list", length(RAW_EMG)) list_names <- character(length=length(RAW_EMG)) # Progress bar pb <- progress::progress_bar$new(format="[:bar]:percent ETA: :eta", total=length(RAW_EMG), clear=F, width=50) message("\nApply filters") for (ii in seq_along(RAW_EMG)) { pb$tick() if (nrow(RAW_EMG[[ii]])<1) { stop("\n\nTrial ", ii, " (", trial, ") is empty! Please check your raw data.") } trial <- gsub("RAW_EMG_", "", names(RAW_EMG[ii])) # Set "time" column name to small letters and muscles to capital names(RAW_EMG[[ii]]) <- toupper(names(RAW_EMG[[ii]])) names(RAW_EMG[[ii]])[grep("TIME", names(RAW_EMG[[ii]]))] <- "time" # Muscle list, including time muscles <- names(RAW_EMG[[ii]]) # Trim to the first cy_max+2 cycles # (+2 because first and last will be trimmed after filtering) c_time <- CYCLE_TIMES[[grep(trial, names(CYCLE_TIMES))]] trim <- c_time$touchdown[cy_max+2] # Check if there are more than cy_max cycles and do not trim if false label <- which(RAW_EMG[[ii]]$time>trim)[1] if (is.na(label)) { emg_data <- RAW_EMG[[ii]] } else { emg_data <- RAW_EMG[[ii]][1:(label-1), ] } # Demean EMG (subtract mean value from the signal to eliminate offset shifts) time <- emg_data$time emg_data <- apply(emg_data, 2, function(x) x-mean(x, na.rm=T)) emg_data[, "time"] <- time # EMG system acquisition frequency [Hz] freq <- round(1/(mean(diff(emg_data[, "time"]), na.rm=T)), 0) # Filtering emg_data_filt <- emg_data emg_data_filt <- emg_data_filt[, colnames(emg_data_filt)!="time"] # High-pass IIR (Infinite Impulse Response) Butterworth zero-phase filter design # Critical frequencies must be between 0 and 1, where 1 is the Nyquist frequency # "filtfilt" is for zero-phase filtering HPfn <- HPf/(freq/2) # Normalise by the Nyquist frequency (f/2) HP <- signal::butter(HPo, HPfn, type="high") emg_data_filt <- apply(emg_data_filt, 2, function(x) signal::filtfilt(HP, x)) # Full-wave rectification emg_data_filt <- abs(emg_data_filt) # Low-pass IIR (Infinite Impulse Response) Butterworth zero-phase filter design # Critical frequencies must be between 0 and 1, where 1 is the Nyquist frequency # "filtfilt" is for zero-phase filtering LPfn <- LPf/(freq/2) # Normalise by the Nyquist frequency (f/2) LP <- signal::butter(LPo, LPfn, type="low") emg_data_filt <- apply(emg_data_filt, 2, function(x) signal::filtfilt(LP, x)) emg_data_filt[emg_data_filt<0] <- 0 # Set negative values to zero temp <- emg_data_filt temp[temp==0] <- Inf emg_data_filt[emg_data_filt==0] <- min(temp) # Set the zeros to the smallest non-zero entry # Subtract the minimum emg_data_filt <- apply(emg_data_filt, 2, function(x) x-min(x)) # Amplitude normalisation to the maximum of the trial emg_data_filt <- apply(emg_data_filt, 2, function(x) x/max(x)) emg_time <- seq(emg_data[, "time"][1], emg_data[, "time"][nrow(emg_data_filt)], 1/freq) # Trim first and last cycle to remove filtering effects c_time <- c_time[2:(nrow(c_time)-1), ] cycs <- nrow(c_time)-1 # Remove excess cycles, if present if (cycs>cy_max) cycs <- cy_max # Isolate cycles and normalise time to "points" points # (first half stance, second half swing) for (jj in 1:cycs) { # Stance temp <- data.frame() t1 <- c_time$touchdown[jj] t2 <- c_time$touchdown[jj]+c_time$stance[jj] if (t1>max(emg_time, na.rm=T) || t2>max(emg_time, na.rm=T)) { cycs <- jj-1 break } else { t1 <- which(emg_time>=t1)[1] t2 <- which(emg_time>=t2)[1] temp <- emg_data_filt[t1:t2, ] } # Check if there is data if (sum(temp, na.rm=T)==0) next # Interpolate each channel to (points/2) points temp1 <- data.frame(time=c(1:(points/2)), apply(temp, 2, function(x) approx(x, method="linear", n=points/2)$y)) # Swing temp <- data.frame() t1 <- c_time$touchdown[jj]+c_time$stance[jj] t2 <- c_time$touchdown[jj+1] if (t1>max(emg_time, na.rm=T) || t2>max(emg_time, na.rm=T)) { cycs <- jj-1 break } else { t1 <- which(emg_time>=t1)[1] t2 <- which(emg_time>=t2)[1] temp <- emg_data_filt[t1:t2, ] } # Check if there is data if (sum(temp, na.rm=T)==0) next # Interpolate each channel to (points/2) points temp2 <- data.frame(time=c(1:(points/2)), apply(temp, 2, function(x) approx(x, method="linear", n=points/2)$y)) temp <- rbind(temp1, temp2) # Set every value >1 to 1 temp[temp>1] <- 1 temp$time <- c(1:points) # For the concatenated data if (jj==1) { emg_data_co <- temp # For the averaged data emg_data_av <- matrix(0, nrow=points, ncol=ncol(emg_data_co)) } else { emg_data_co <- rbind(emg_data_co, temp) # For the averaged data emg_data_av <- emg_data_av+temp } } FILT_EMG[[ii]] <- emg_data_co list_names[ii] <- paste0("FILT_EMG_", trial) # Export average cycles for checking # Normalise the averaged data emg_data_av <- data.frame(apply(emg_data_av, 2, function(x) x/max(x))) emg_data_av[, "time"] <- c(1:points) # Make graphs Cairo::Cairo(file=paste0(path_for_graphs, "temp.png", sep=""), width=1500, height=1500, pointsize=12, dpi=120) varlist <- list() for (mm in 2:length(muscles)) { data <- data.frame(emg_data_av$time, emg_data_av[, grep(paste0("^", muscles[mm], "$"), colnames(emg_data_av))]) colnames(data) <- c("time", "signal") varname <- paste("pp", mm, sep = "") temp <- ggplot2::ggplot() + ggplot2::ggtitle(muscles[mm]) + ggplot2::ylim(0, 1) + ggplot2::geom_line(data=data, ggplot2::aes(x=time, y=signal), colour="black", size=0.9) + ggplot2::theme(axis.title=ggplot2::element_blank(), panel.background=ggplot2::element_rect(fill="white", colour="gray"), panel.grid.major=ggplot2::element_line(colour="gray", size=0.05), panel.grid.minor=ggplot2::element_blank(), legend.position = "none") varlist[[mm-1]] <- assign(varname, temp) } gridExtra::grid.arrange(grobs=varlist, nrow=ceiling(sqrt(length(varlist))), ncol=ceiling(sqrt(length(varlist))), top=(paste0(trial, sep=""))) dev.off() # Close Cairo export file.rename(paste0(path_for_graphs, "temp.png"), paste0(path_for_graphs, "EMG_average_", trial, ".png", sep="")) } names(FILT_EMG) <- list_names message("\nSaving filtered EMG...") save(FILT_EMG, file=paste0(data_path, "FILT_EMG.RData")) message("...done!") } # STEP 2 - Synergies extraction ---- message("\n#################################", "\n STEP 2/4 - Synergies extraction", "\n#################################") test <- length(list.files(data_path, pattern="^SYNS.RData$")) if (test==1) { qq <- 1 while (!is.na(qq)) { message("\nSynergy data is already present in the specified folder!", "\nDo you want to use this data (type 'y' for 'yes') or recalculate (type 'n' for 'no')?", "\nNOTE: new calculations might require a few hours, depending on the size of the data set") qq <- readline() # Break if user decides if (qq=="y" || qq=="n") break } } else if (test==0) qq <- "n" if (qq=="n") { # Load filtered EMG data if not already done if (all(!grepl("^FILT_EMG$", objects()))) { message("\nLoading filtered EMG...") load(paste0(data_path, "FILT_EMG.RData")) message("...done!") } ll <- length(FILT_EMG) # Check for correct naming of trials if (any(!grepl("^FILT_EMG_ID[0-9]+_.+_[0-9]+$", names(FILT_EMG)))) { message("") stop("\nATTENTION: your trials are not named after the guidelines!\nPlease double-check names.") } # Define non-negative matrix factorisation function synsNMFn <- function(V) { R2_target <- 0.01 # Convergence criterion (percent of the R2 value) R2_cross <- numeric() # R2 values for cross validation and syn number assessment M_list <- list() # To save factorisation M matrices (synergies) P_list <- list() # To save factorisation P matrices (primitives) Vr_list <- list() # To save factorisation Vr matrices (reconstructed signals) iters <- numeric() # To save the iterations number # Original matrix time <- V$time V$time <- NULL V <- as.matrix(t(V)) # Needs to be transposed for NMF V[V<0] <- 0 # Set negative values to zero temp <- V temp[temp==0] <- Inf V[V==0] <- min(temp, na.rm=T) # Set the zeros to the smallest non-zero entry in V m <- nrow(V) # Number of muscles n <- ncol(V) # Number of time points max_syns <- m-round(m/4, 0) # Max number of syns for (r in 1:max_syns) { # Run NMF with different initial conditions R2_choice <- numeric() # Collect the R2 values for each syn and choose the max # Preallocate to then choose those with highest R2 M_temp <- list() P_temp <- list() Vr_temp <- list() for (j in 1:5) { # Run NMF 5 times for each syn and choose best run # To save error values R2 <- numeric() # 1st cost function (R squared) SST <- numeric() # Total sum of squares RSS <- numeric() # Residual sum of squares or min reconstruction error # Initialise iterations and define max number of iterations iter <- 1 max_iter <- 1000 # Initialise the two factorisation matrices with random values (uniform distribution) P <- matrix(runif(r*n, min=0.01, max=1), nrow=r, ncol=n) M <- matrix(runif(m*r, min=0.01, max=1), nrow=m, ncol=r) # Iteration zero P <- P*(t(M)%*%V)/(t(M)%*%M%*%P) M <- M*(V%*%t(P))/(M%*%P%*%t(P)) Vr <- M%*%P # Reconstructed matrix RSS <- sum((V-Vr)^2) SST <- sum((V-mean(V))^2) R2[iter] <- 1-(RSS/SST) # l2-norm normalisation which eliminates trivial scale indeterminacies # The cost function doesn't change. Impose ||M||2 = 1 and normalise P accordingly. # ||M||2, also called L2,1 norm or l2-norm, is a sum of Euclidean norm of columns. for (kk in 1:r) { norm <- sqrt(sum(M[, kk]^2)) M[, kk] <- M[, kk]/norm P[kk, ] <- P[kk, ]*norm } # Start iterations for NMF convergence for (iter in iter:max_iter) { P <- P*(t(M)%*%V)/(t(M)%*%M%*%P) M <- M*(V%*%t(P))/(M%*%P%*%t(P)) Vr <- M%*%P RSS <- sum((V-Vr)^2) SST <- sum((V-mean(V))^2) R2[iter] <- 1-(RSS/SST) # l2-norm normalisation for (kk in 1:r) { norm <- sqrt(sum(M[, kk]^2)) M[, kk] <- M[, kk]/norm P[kk, ] <- P[kk, ]*norm } # Check if the increase of R2 in the last 20 iterations is less than the target if (iter>20) { R2_diff <- R2[iter]-R2[iter-20] if (R2_diff1e-04) { iter <- iter+1 if (iter==r-1) { break } R2_interp <- data.frame(synergies=c(1:(r-iter+1)), R2_values=R2_cross[iter:r]) lin <- lm(R2_values~synergies, R2_interp)$fitted.values MSE <- sum((lin-R2_interp$R2_values)^2)/nrow(R2_interp) } syns_R2 <- iter P_choice <- data.frame(time, t(P_list[[syns_R2]])) colnames(P_choice) <- c("time", paste0("Syn", 1:(ncol(P_choice)-1))) rownames(P_choice) <- NULL return(list(synsR2=as.numeric(syns_R2), M=M_list[[syns_R2]], P=P_choice, Vr=Vr_list[[syns_R2]], iterations=as.numeric(iters[syns_R2]), R2=as.numeric(R2_cross[syns_R2]))) } # Get date and time for display before starting computation date <- gsub(" [0-9][0-9]:[0-9][0-9]:[0-9][0-9]$", "", Sys.time()) time <- gsub("^[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] ", "", Sys.time()) message("\nExtract synergies\nStarted on ", date, " at ", time) tictoc <- system.time({ # "synsNMFn" is the core function for extracting synergies # and here it is applied in parallel to speed up computation # At the moment there is no progress bar for parLapply SYNS <- parallel::parLapply(cl, FILT_EMG, synsNMFn) }) names(SYNS) <- gsub("FILT_EMG", "SYNS", names(SYNS)) message("- - - - - - - - - - - - - - - - - - - - - - -\n", benchmarkme::get_sys_details()$data, "\n", benchmarkme::get_sys_details()$sys_info$sysname, " ", benchmarkme::get_sys_details()$sys_info$release, " ", benchmarkme::get_sys_details()$sys_info$version, "\n", benchmarkme::get_sys_details()$cpu$model_name, " ", "\n", R.version$version.string, "\n", parallel::detectCores(logical=F), " cores, ", parallel::detectCores(), " logical, ", length(cl), " used", "\n\n Number of trials: ", ll, "\n Computation time: ", round(tictoc[[3]], 0), " s", "\nAverage trial comp. time: ", round(tictoc[[3]]/ll, 2), " s\n", "\n- - - - - - - - - - - - - - - - - - - - - - -") message("\nSaving synergies...") save(SYNS, file=paste0(data_path, "SYNS.RData")) message("...done!") } # STEP 3 - Classification of muscle synergies ---- message("\n###############################################", "\n STEP 3/4 - Classification of muscle synergies", "\n###############################################") test <- length(list.files(data_path, pattern="^SYNS_classified.RData$")) if (test==1) { qq <- 1 while (!is.na(qq)) { message("\nClassified synergy data is already present in the specified folder!", "\nDo you want to use this data (type 'y' for 'yes') or recalculate (type 'n' for 'no')?", "\nNOTE: new calculations might require a few minutes, depending on the size of the data set") qq <- readline() # Break if user decides if (qq=="n") { # Prompt for classification method ww <- 1 while (!is.na(ww)) { message("\nPlease choose a classification method: k-means (type 'k') or NMF (type 'n')", "\nNOTE: k-means is faster but NMF gives similar results") ww <- readline() # Break if user decides if (ww=="k" || ww=="n") break } break } else if (qq=="y") break } } else if (test==0) { qq <- "n" # Prompt for classification method ww <- 1 while (!is.na(ww)) { message("\nPlease choose a classification method: k-means (type 'k') or NMF (type 'n')", "\nNOTE: k-means is faster but NMF gives similar results") ww <- readline() # Break if user decides if (ww=="k" || ww=="n") break } } # Load data and define common functions if (qq=="n") { # Load muscle synergies if not already done if (all(!grepl("^SYNS$", objects()))) { load(paste0(data_path, "SYNS.RData")) } # Check for correct naming of trials if (any(!grepl("^SYNS_ID[0-9]+_.+_[0-9]+$", names(SYNS)))) { message("") stop("\nATTENTION: your trials are not named after the guidelines!\nPlease double-check names.") } # Get concatenated primitives SYNS_P <- lapply(SYNS, function(x) x$P) # Get motor modules SYNS_M <- lapply(SYNS, function(x) x$M) # Make sure that all motor primitives are normalised to the same amount of points points <- unlist(lapply(SYNS_P, function(x) max(x$time))) if (sd(points)!=0) { message("\nNot all motor primitives are normalised to the same amount of points!", "\nPlease re-check your data\n") } else points <- unique(points) # Find number of muscles muscle_num <- unique(unlist(lapply(SYNS_M, function(x) nrow(x)))) if (length(muscle_num)!=1) { message("Not all trials have the same number of muscles!!!") break } message("\nCalculating mean gait cycles...") # Progress bar pb <- progress::progress_bar$new(format="[:bar]:percent ETA: :eta", total=length(SYNS_P), clear=F, width=50) SYNS_P <- lapply(SYNS_P, function(x) { pb$tick() x$time <- NULL temp <- matrix(0, nrow=points, ncol=ncol(x)) for (cc in seq(1, (1+nrow(x)-points), points)) { temp <- temp+x[c(cc:(cc+points-1)), ] } # Divide by the number of cycles to get mean value temp <- temp/(nrow(x)/points) # Amplitude normalisation x <- apply(temp, 2, function(y) y/(max(y))) # Transpose to facilitate visualisation return(t(x)) }) message("...done!") message("\nPutting primitives into a single data frame...") # Progress bar pb <- progress::progress_bar$new(format="[:bar]:percent ETA: :eta", total=length(SYNS_P), clear=F, width=50) data_P <- plyr::ldply(SYNS_P, function(x) { pb$tick() data.frame(x) }) message("...done!") message("\nPutting modules into a single data frame...") # Progress bar pb <- progress::progress_bar$new(format="[:bar]:percent ETA: :eta", total=length(SYNS_M), clear=F, width=50) data_M <- plyr::ldply(SYNS_M, function(x) { pb$tick() t(data.frame(x)) }) message("...done!") # Check if names are the same for primitives and modules if (identical(data_P$.id, data_M$.id)) { trials <- data_M$.id } else { message("The names of primitives and modules are not the same!!!") break } # Give names to trials (start from synergy zero because # the function "make.unique" works like that) # Find non-duplicated names and assign "Syn0" to them syn0 <- which(!duplicated(trials)) trials <- make.unique(trials) trials[syn0] <- paste0(trials[syn0], "_Syn0") # Assign incremental Syn number to other names trials <- gsub("\\.", "_Syn", trials) # Start from Syn1 instead that from Syn0 temp1 <- gsub("[0-9]$", "", trials) temp2 <- as.numeric(gsub(".*_Syn", "", trials))+1 trials <- paste0(temp1, temp2) # Assign new names to row names and remove id column rownames(data_P) <- trials rownames(data_M) <- trials data_P$.id <- NULL data_M$.id <- NULL # Filter primitives to improve classification data_P <- t(apply(data_P, 1, function(x) { # Build filter LP <- signal::butter(4, 10/(points/2), type="low") # Apply filter x <- signal::filtfilt(LP, t(x)) # Remove negative entries x[x<0] <- 0 # Subtract the minimum x <- x-min(x) # Set zeroes to smallest non-negative entry temp <- x temp[temp==0] <- Inf x[x==0] <- min(temp, na.rm=T) # Normalise to maximum x <- x/max(x) return(x) })) # Define centre of activity (CoA) CoA <- function(x) { points <- length(x) AA <- numeric() BB <- numeric() for (pp in 1:points) { alpha <- 360*(pp-1)/(points-1)*pi/180 vec <- x[pp] AA[pp] <- vec*cos(alpha) BB[pp] <- vec*sin(alpha) } AA <- sum(AA) BB <- sum(BB) CoAt <- atan(BB/AA)*180/pi # To keep the sign if (AA>0 && BB>0) CoAt <- CoAt if (AA<0 && BB>0) CoAt <- CoAt+180 if (AA<0 && BB<0) CoAt <- CoAt+180 if (AA>0 && BB<0) CoAt <- CoAt+360 CoAt*points/360 } # Define NMF function NMFn <- function(V) { R2_target <- 0.01 # Convergence criterion (percent of the R2 value) R2_cross <- numeric() # R2 values for cross validation and syn number assessment M_list <- list() # To save factorisation M matrices (synergies) P_list <- list() # To save factorisation P matrices (primitives) # Original matrix V <- as.matrix(V) V[V<0] <- 0 # Set negative values to zero temp <- V temp[temp==0] <- Inf V[V==0] <- min(temp, na.rm=T) # Set the zeros to the smallest non-zero entry in V m <- nrow(V) # Number of primitives in the data-set n <- ncol(V) # Number of time points # Determine the maximum number of synergies by searching for the maximum rank temp <- as.numeric(gsub(".*\\_Syn", "", rownames(V))) # Add one because interpolation must happen with at least two points max_syns <- max(temp)+1 for (r in 1:max_syns) { # Run NMF with different initial conditions R2_choice <- numeric() # Collect the R2 values for each syn and choose the max # Preallocate to then choose those with highest R2 M_temp <- list() P_temp <- list() for (j in 1:5) { # Run NMF 5 times for each syn and choose best run # To save error values R2 <- numeric() # 1st cost function (R squared) SST <- numeric() # Total sum of squares RSS <- numeric() # Residual sum of squares or min reconstruction error # Initialise iterations and define max number of iterations iter <- 1 max_iter <- 1000 # Initialise the two factorisation matrices with random values # (uniform distribution) P <- matrix(runif(r*n, min=0.01, max=1), nrow=r, ncol=n) M <- matrix(runif(m*r, min=0.01, max=1), nrow=m, ncol=r) # Iteration zero P <- P*(t(M)%*%V)/(t(M)%*%M%*%P) M <- M*(V%*%t(P))/(M%*%P%*%t(P)) Vr <- M%*%P # Reconstructed matrix RSS <- sum((V-Vr)^2) SST <- sum((V-mean(V))^2) R2[iter] <- 1-(RSS/SST) # l2-norm normalisation which eliminates trivial scale indeterminacies for (kk in 1:r) { norm <- sqrt(sum(M[, kk]^2)) M[, kk] <- M[, kk]/norm P[kk, ] <- P[kk, ]*norm } # Start iterations for NMF convergence for (iter in iter:max_iter) { P <- P*(t(M)%*%V)/(t(M)%*%M%*%P) M <- M*(V%*%t(P))/(M%*%P%*%t(P)) Vr <- M%*%P RSS <- sum((V-Vr)^2) SST <- sum((V-mean(V))^2) R2[iter] <- 1-(RSS/SST) # l2-norm normalisation for (kk in 1:r) { norm <- sqrt(sum(M[, kk]^2)) M[, kk] <- M[, kk]/norm P[kk, ] <- P[kk, ]*norm } # Check if the increase of R2 in the last 20 iterations is less than the target if (iter>20) { R2_diff <- R2[iter]-R2[iter-20] if (R2_diff1e-04) { iter <- iter+1 if (iter==r-1) { break } R2_interp <- data.frame(synergies=c(1:(r-iter+1)), R2_values=R2_cross[iter:r]) lin <- lm(R2_values~synergies, R2_interp)$fitted.values MSE <- sum((lin-R2_interp$R2_values)^2)/nrow(R2_interp) } syns_R2 <- iter return(list(M=M_list[[syns_R2]], P=P_list[[syns_R2]])) } # Find different conditions # (trials must be named as stated in the beginning of this script) # "*": 0 or more # "+": 1 or more conditions <- gsub("SYNS_ID[0-9]+_", "", rownames(data_M)) conditions <- unique(gsub("_Syn.*", "", conditions)) conditions <- unique(gsub("_[0-9]+$", "", conditions)) # Build list with the trials of the different conditions all_P <- vector("list", length(conditions)) names(all_P) <- conditions all_M <- all_P for (condition in conditions) { all_P[[condition]] <- data_P[grep(paste0("_", condition, "_"), rownames(data_P)), ] all_M[[condition]] <- as.matrix(data_M[grep(paste0("_", condition, "_"), rownames(data_M)), ]) } } if (qq=="n" && ww=="k") { # Unsupervised learning method to classify synergies based on k-means message("\nClustering motor primitives with k-means...") # Progress bar pb <- progress::progress_bar$new(format="[:bar]:percent ETA: :eta", total=length(all_P), clear=F, width=50) # Apply k-means to motor primitives # par(mfrow=c(2, 1)) clust_P <- lapply(all_P, function(x) { pb$tick() # Determine number of clusters by computing the within-group sum of squares # for an increasing number of clusters and then searching for an elbow in the # clusters vs. withinss curve # nstart is set >1 due to instabilities found (of course this slows down computation) kmeans_all <- list() for (clust in 1:muscle_num) { kmeans_all[[clust]] <- kmeans(x, centers=clust, nstart=20, algorithm="Hartigan-Wong") } withinss <- unlist(lapply(kmeans_all, function(y) sum(y$withinss))) withinss <- withinss-min(withinss) withinss <- withinss/max(withinss) # plot(x=c(1:muscle_num), y=withinss, # type="b", # xlab="Number of clusters", ylab="Within groups sum of squares") # Find the elbow in the clusters vs. withinss curve MSE <- 100 # Initialise the Mean Squared Error (MSE) iter <- 0 # Initialise iterations while (MSE>1e-03) { iter <- iter+1 if (iter==muscle_num-1) { break } withinss_interp <- data.frame(xx=c(1:(muscle_num-iter+1)), yy=withinss[iter:(muscle_num)]) # plot(x=withinss_interp$xx+iter, # y=withinss_interp$yy, # xlim=c(1, muscle_num), # ylim=c(0, 1), # ty="b") linear <- lm(yy~xx, withinss_interp)$fitted.values MSE <- sum((linear-withinss_interp$yy)^2)/nrow(withinss_interp) } clust <- iter # abline(v=clust, col=2, lwd=2) kmeans_all[[clust]] }) message("...done!") # Write number of clusters per condition in a simple way clust_num_P <- unlist(lapply(clust_P, function(x) max(x$cluster))) # Apply k-means to motor modules message("\nClustering motor modules with k-means...") # Progress bar pb <- progress::progress_bar$new(format="[:bar]:percent ETA: :eta", total=length(all_M), clear=F, width=50) clust_M <- clust_P for (cond in conditions) { pb$tick() # Use previously-determined number of clusters clust_M[[cond]] <- kmeans(all_M[[cond]], centers=clust_num_P[cond], nstart=20, algorithm="Hartigan-Wong") } message("...done!") order_list <- list() # Order synergies based on k-means clustering for (cond in conditions) { orders <- data.frame(clusters_P=clust_P[[cond]]$cluster, clusters_M=clust_M[[cond]]$cluster, FWHM=apply(all_P[[cond]], 1, function(x) length(which(x>=0.5))), CoA=apply(all_P[[cond]], 1, function(x) CoA(x))) clust_num <- clust_num_P[cond] # Arrange primitive- and module-based clusters in the same order temp_P <- subset(orders, select=-c(clusters_M)) temp_M <- subset(orders, select=-c(clusters_P)) # Take average FWHM and CoA based on cluster geoms_P <- data.frame(aggregate(FWHM~clusters_P, temp_P, mean), CoA=aggregate(CoA~clusters_P, temp_P, mean)$CoA) geoms_M <- data.frame(aggregate(FWHM~clusters_M, temp_M, mean), CoA=aggregate(CoA~clusters_M, temp_M, mean)$CoA) # Define score as sum of FWHM and CoA and normalise to number of points geoms_P <- data.frame(clust_P=geoms_P$clusters_P, score=(geoms_P$FWHM+geoms_P$CoA)/(2*points)) geoms_M <- data.frame(clust_M=geoms_M$clusters_M, score=(geoms_M$FWHM+geoms_M$CoA)/(2*points)) # Calculate mutual score squared residuals and find minimum perms <- gtools::permutations(nrow(geoms_P), r=2, repeats.allowed=T) resids <- numeric() for (perm in 1:nrow(perms)) { resids[perm] <- (geoms_P$score[perms[perm, 1]]-geoms_M$score[perms[perm, 2]])^2 } perms <- perms[sort(resids, decreasing=F, index.return=T)$ix, ] perms <- data.frame(perms[-which(duplicated(perms[, 1])), ]) colnames(perms) <- c("old", "new") # # Save first plots # ggP <- ggplot(data=orders, aes(x=FWHM, y=CoA, colour=factor(clusters_P))) + # geom_point(size=2) + # xlim(0, points) + # ylim(0, points) + # ggtitle(paste0(cond, " (primitive clustering)")) + # theme(legend.title=element_blank()) # # ggM <- ggplot(data=orders, aes(x=FWHM, y=CoA, colour=factor(clusters_M))) + # geom_point(size=2) + # xlim(0, points) + # ylim(0, points) + # ggtitle(paste0(cond, " (module clustering)")) + # theme(legend.title=element_blank()) # # gridExtra::grid.arrange(grobs=list(ggP, ggM), nrow=2, ncol=1) if (all(!duplicated(perms$old)) && all(!duplicated(perms$new))) { if (!identical(perms$old, perms$new)) { temp_M <- orders$clusters_M orders$clusters_M <- c(perms$old, temp_M)[match(temp_M, c(perms$new, temp_M))] } # Find discordant classifications and label as "combined" discordant <- which(orders$clusters_P!=orders$clusters_M) # But first check if all clusters are there clust_test <- orders$clusters_P clust_test <- unique(clust_test[-discordant]) if (length(clust_test)clust_num){ pp <- -1 } } orders_new[cc] <- pp } orders_new <- as.numeric(orders_new) orders_new <- sort.int(orders_new, index.return=T)$ix # Re-create ordering rule order_rule <- data.frame(old=c(1:clust_num), new=orders_new) # Apply new order to mean curves mean_P <- mean_P[order_rule$new, ] mean_M <- mean_M[order_rule$new, ] rownames(mean_P) <- paste0("Syn", 1:nrow(mean_P)) rownames(mean_M) <- paste0("Syn", 1:nrow(mean_M)) # Apply new order to all temp_P <- orders$clusters_P temp_M <- orders$clusters_M orders$clusters_P <- c(order_rule$old, temp_P)[match(temp_P, c(order_rule$new, temp_P))] orders$clusters_M <- c(order_rule$old, temp_M)[match(temp_M, c(order_rule$new, temp_M))] # Re-plot classified syns par(mfrow=c(clust_num, 2), mai=c(dev_size[2]/red_factor, dev_size[1]/red_factor, dev_size[2]/red_factor, dev_size[1]/red_factor)) for (syn in 1:clust_num) { # Plot motor modules barplot(mean_M[syn, ]) # Plot motor primitives plot(x=c(1:ncol(mean_P)), y=mean_P[syn, ], ty="l", main=paste0("Synergy ", syn, ", ", cond), xlab="", ylab="", xaxt="n", yaxt="n", lwd=2) } Sys.sleep(2) } # Remove double classifications, if present participants <- gsub("SYNS_", "", rownames(orders)) participants <- unique(gsub("_.*", "", participants)) for (participant in participants) { trial <- orders[grep(paste0(participant, "_"), rownames(orders)), ] trial$clusters_M <- NULL # Find duplicates dupl <- which(duplicated(trial$clusters_P)) if (length(dupl)>0) { for (syn in c(1:clust_num)) { dupl <- grep(paste0("^", syn, "$"), trial$clusters_P) if (length(dupl)<=1) { next } else { R2 <- numeric() P2 <- as.numeric(mean_P[syn, ]) for (dd in c(1:length(dupl))) { trial_syn <- rownames(trial)[dupl[dd]] # Calculate R2 between each duplicated primitive and the mean P1 <- as.numeric(data_P[grep(trial_syn, rownames(data_P)), ]) RSS <- sum((P1-P2)^2) SST <- sum((P1-mean(P1))^2) R2[dd] <- 1-(RSS/SST) } dupl <- dupl[-which.max(R2)] trial$clusters_P[dupl] <- "combined" } } } if (participant==participants[1]) { orders_new <- trial } else orders_new <- rbind(orders_new, trial) } # # Make last plots # ggfinal <- ggplot(data=orders_new, aes(x=FWHM, y=CoA, # colour=factor(clusters_P))) # # ggP <- ggP + geom_point(size=2) + # xlim(0, points) + # ylim(0, points) + # ggtitle(paste0(cond, " (primitive clustering)")) + # theme(legend.title=element_blank()) # # ggM <- ggM + geom_point(size=2) + # xlim(0, points) + # ylim(0, points) + # ggtitle(paste0(cond, " (module clustering)")) + # theme(legend.title=element_blank()) # # ggfinal <- ggfinal + geom_point(size=2) + # xlim(0, points) + # ylim(0, points) + # ggtitle(paste0(cond, " (final clustering)")) + # theme(legend.title=element_blank()) # # # gridExtra::grid.arrange(grobs=list(ggP, ggM, ggfinal), nrow=3, ncol=1) trial <- gsub("_Syn[0-9]*$", "", rownames(orders_new)) new <- paste0(trial, "_Syn", orders_new$clusters_P) orders_new <- data.frame(old=rownames(orders_new), new, syn_classified=gsub(".*_Syn", "", new), trial) order_list[[cond]] <- orders_new combined <- length(grep("Syncombined", orders_new$new)) total <- nrow(orders_new) message("\n Locomotion type: ", cond, "\n Total synergies: ", total, "\n Recognised: ", total-combined, "\n Combined: ", combined, " (~", round(combined/total*100, 0), "%)", "\n Clusters: ", clust_num, "\n") syn_perc <- numeric() for (ss in 1:clust_num) { syn_perc[ss] <- round(length(grep(paste0("^", ss, "$"), orders_new$syn_classified))/length(participants)*100, 0) message(" Syn", ss, ": ", syn_perc[ss], "%") } } orders <- plyr::ldply(order_list, data.frame) # Rename synergies in the correct order and save SYNS_classified <- SYNS # Find unique trial names trials <- which(!duplicated(orders$trial)) for (uu in seq_along(trials)) { # Read the classification if (uusyns_num_n){ pp <- -1 } } orders_new[cc] <- pp } orders_new <- as.numeric(orders_new) orders <- sort.int(orders_new, index.return=T)$ix data_NMF_P <- data_NMF_P[orders, ] data_NMF_M <- data_NMF_M[, orders] # Normalise to 1 data_NMF_P <- data.frame(t(apply(data_NMF_P, 1, function(x) x/max(x)))) data_NMF_M <- data.frame(apply(data_NMF_M, 2, function(x) x/max(x))) rownames(data_NMF_P) <- paste0("Syn", 1:nrow(data_NMF_P)) colnames(data_NMF_M) <- paste0("Syn", 1:ncol(data_NMF_M)) # Re-plot classified syns par(mfrow=c(syns_num_n, 2), mai=c(dev_size[2]/red_factor, dev_size[1]/red_factor, dev_size[2]/red_factor, dev_size[1]/red_factor)) for (syn in 1:syns_num_n) { plot(x=c(1:ncol(data_NMF_P)), y=data_NMF_P[syn, ], ty="l", main=paste0("Synergy ", syn, ", ", cond), xlab="", ylab="", xaxt="n", yaxt="n", lwd=2) barplot(sort(data_NMF_M[, syn], decreasing=T)) abline(h=seq(0.2, 0.8, 0.1), col=2) tot <- length(data_NMF_M[, syn]) abline(v=tot*seq(0, 1, 0.25), col=2) } Sys.sleep(2) } # Now search for syns having module bigger than "M_threshold" # If a syn has more than one module, choose the one with the highest value # Then compare the primitive to the relevant one and assess similarity # If similarity is lower than "R2_threshold", classify as combined, otherwise keep temp <- data_NMF_M # Determine the threshold for M and R2 # Calculate the M threshold M_threshold <- mean(colMeans(temp, na.rm=T), na.rm=T) temp[temp=1 || sum(temp[tt, ], na.rm=T)!=0) { # Find position of maximum choice <- which(temp[tt, ]==max(temp[tt, ], na.rm=T)) # Discard the others temp[tt, -choice] <- NA # Calculate R2 between curve and primitive P1 <- as.numeric(data_NMF_P[choice, ]) P2 <- as.numeric(data_P[tt, ]) RSS <- sum((P1-P2)^2) SST <- sum((P1-mean(P1))^2) R2 <- 1-(RSS/SST) quality[tt] <- R2 } } # Calculate the R2 threshold R2_threshold <- mean(quality, na.rm=T) if (sign(R2_threshold)==1) { R2_threshold <- R2_threshold/4 } else if (sign(R2_threshold)==-1) { R2_threshold <- R2_threshold*4 } quality[quality<=R2_threshold] <- NA classification <- numeric() weight <- numeric() for (tt in 1:nrow(temp)) { if (sum(temp[tt, ], na.rm=T)==0) { classification[tt] <- NA weight[tt] <- NA next } else if (sum(is.na(temp[tt, ]))>=1 || sum(temp[tt, ], na.rm=T)!=0) { # Find position of maximum choice <- which(temp[tt, ]==max(temp[tt, ], na.rm=T)) # Discard the others temp[tt, -choice] <- NA # Discard if R2 between curve and primitive is lower than R2_threshold R2 <- quality[tt] if (is.na(R2)) { classification[tt] <- NA weight[tt] <- NA } else { classification[tt] <- choice weight[tt] <- temp[tt, choice] } } } trial <- gsub("_Syn.*", "", rownames(temp)) syn <- as.numeric(gsub(".*_Syn", "", rownames(temp))) ordered <- data.frame(trial, syn, classification, weight, quality) colnames(ordered) <- c("trial", "syn_original", "syn_classified", "weight", "quality") syns_num <- max(ordered$syn_classified, na.rm=T) ordered$syn_classified[which(is.na(ordered$syn_classified))] <- "combined" # Remove double classifications, if present # Find unique trial names trials <- which(!duplicated(ordered$trial)) for (uu in seq_along(trials)) { # Read the classification if (uu0) { temp <- matrix(0, nrow=points, ncol=ncol(x)) colnames(temp) <- colnames(x) for (cc in seq(1, (1+nrow(x)-points), points)) { temp <- temp+x[c(cc:(cc+points-1)), ] } # Divide by the number of cycles to get mean value temp <- temp/(nrow(x)/points) # Minimum subtraction x <- apply(temp, 2, function(y) y-min(y)) # Amplitude normalisation x <- apply(temp, 2, function(y) y/max(y)) } return(data.frame(x)) }) # Remove combined motor modules SYNS_M_all <- lapply(SYNS_M, function(x) { x <- data.frame(x) x[, grep("combined", colnames(x))] <- NULL return(data.frame(x)) }) max_syns <- max(unlist(lapply(SYNS_M_all, function(x) ncol(x)))) conditions <- gsub("^SYNS_ID[0-9]+_", "", names(SYNS_M_all)) conditions <- unique(gsub("_[0-9]+$", "", conditions)) # Progress bar pb <- progress::progress_bar$new(format="[:bar]:percent ETA: :eta", total=length(conditions), clear=F, width=50) message("\nExporting graphs...\n") for (condition in conditions) { pb$tick() Cairo(file=paste0(path_for_graphs, "SYNS_", condition, ".", ty), type=ty, width=wi, height=he, pointsize=mte, dpi=re) SYNS_P_temp <- SYNS_P_all[grep(paste0("_", condition, "_"), names(SYNS_P_all))] SYNS_M_temp <- SYNS_M_all[grep(paste0("_", condition, "_"), names(SYNS_M_all))] varlist <- list() for (syn in 1:max_syns) { # Select relevant synergies data_P <- lapply(SYNS_P_temp, function(x) t(x[, grep(paste0("^Syn", syn), colnames(x))])) data_M <- lapply(SYNS_M_temp, function(x) { muscles <- rownames(x) x <- x[, grep(paste0("^Syn", syn), colnames(x))] if (length(x)>0) names(x) <- muscles t(x) }) # Remove empty trials, if present data_P <- data_P[lapply(data_P, length)>0] data_M <- data_M[lapply(data_M, length)>0] # Put primitives in a single data frame data_P <- plyr::ldply(data_P, data.frame, .id="trial") # Put modules in a single data frame data_M <- plyr::ldply(data_M, data.frame, .id="trial") data_P_av <- data.frame(time=c(1:(ncol(data_P)-1)), value=colMeans(data_P[, -1])) data_P_sd <- data.frame(time=c(1:(ncol(data_P)-1)), ymin=data_P_av$value-apply(data_P[, -1], 2, sd), ymax=data_P_av$value+apply(data_P[, -1], 2, sd)) data_M_av <- data.frame(value=colMeans(data_M[, -1])) data_M_av <- data.frame(muscle=rownames(data_M_av), data_M_av) data_P <- reshape2::melt(data_P, id="trial") data_M <- reshape2::melt(data_M, id="trial") temp_P <- ggplot2::ggplot() + ggplot2::ggtitle(paste0("Motor primitive ", syn)) + ggplot2::ylim(-0.2, 1.2) + ggplot2::geom_ribbon(data=data_P_sd, ggplot2::aes(x=time, ymin=ymin, ymax=ymax), fill="grey80") + ggplot2::geom_line(data=data_P_av, ggplot2::aes(x=time, y=value), colour=c_signal, size=s_line) + ggplot2::theme(axis.title=ggplot2::element_blank(), panel.background=ggplot2::element_rect(fill=c_back, colour=c_bord), panel.grid.major=ggplot2::element_line(colour=c_min, size=s_min), panel.grid.minor=ggplot2::element_blank(), legend.position="none") temp_M <- ggplot2::ggplot() + ggplot2::ggtitle(paste0("Motor module ", syn)) + ggplot2::ylim(0, 1) + ggplot2::geom_hline(yintercept=c(0.25, 0.5, 0.75, 1), size=s_min, color=c_thin) + ggplot2::geom_bar(data=data_M_av, ggplot2::aes(x=muscle, y=value), fill=c_bars, alpha=0.3, stat="identity") + ggplot2::scale_x_discrete(limits=data_M_av$muscle) + ggplot2::geom_jitter(data=data_M, ggplot2::aes(x=variable, y=value), fill=c_bars, width=0.1, size=0.1) + ggplot2::theme(axis.title=ggplot2::element_blank(), axis.text.y=ggplot2::element_blank(), panel.background=ggplot2::element_rect(fill=c_back, colour=c_bord), panel.grid.major=ggplot2::element_blank(), panel.grid.minor=ggplot2::element_blank(), axis.ticks=ggplot2::element_blank(), legend.position="none") varname_P <- paste0("r", 2*syn) varname_M <- paste0("r", 2*syn-1) varlist[[2*syn]] <- assign(varname_P, temp_P) varlist[[2*syn-1]] <- assign(varname_M, temp_M) } suppressWarnings(gridExtra::grid.arrange(grobs=varlist, nrow=max_syns, ncol=2, top=(paste0("Synergies - ", condition)))) dev.off() # Close Cairo export } message("\n...done!")