library(data.table)
library(purrr)
library(MOFA2)
## Warning: replacing previous import 'DelayedArray::pnorm' by 'stats::pnorm'
## when loading 'MOFA2'
This vignette contains a detailed tutorial on how to train MOFA using R. A concise template script is available in the MOFA2 GitHub repository.
To connect with the Python core, MOFA2 uses the reticulate package. Most of the time this is straightforward, but it may require some configuration if you have multiple conda environments and versions of Python installed.
library(reticulate)
# Tell reticulate which Python installation carries the mofapy2 module.
# Choose ONE of the two options below, depending on your setup.
# Option 1: Using a specific python binary
use_python("/home/user/python", required = TRUE)
# Option 2: Using a conda environment called "r-reticulate"
use_condaenv("r-reticulate", required = TRUE)
# If successful, you should be able to run the following two lines without error:
mofa <- import("mofapy2")
mofa_entrypoint <- mofa$run.entry_point$entry_point()
If you have problems, please read the FAQ. If the problem persists, contact us.
Read more about the reticulate package and how it integrates Python and R.
To create a MOFA+ object you need to specify four dimensions: samples (cells), features, view(s) and group(s). MOFA objects can be created from a wide range of input formats; which one to use depends on whether you follow the R path or the Python path:
Simulate data
# Simulate two random gaussian data matrices to illustrate the input format.
# Note: samples are stored in COLUMNS and features in ROWS, so view1 has
# dimensions (D1, N) and view2 has dimensions (D2, N).
N  <- 100  # number of samples
D1 <- 250  # number of features in view 1
D2 <- 500  # number of features in view 2
view1 <- matrix(rnorm(N * D1), nrow = D1, ncol = N)
view2 <- matrix(rnorm(N * D2), nrow = D2, ncol = N)
# Define feature (rows) and sample (columns) names
colnames(view1) <- colnames(view2) <- paste0("sample_", seq_len(N))
rownames(view1) <- paste0("feature", seq_len(D1), "_view", 1)
rownames(view2) <- paste0("feature", seq_len(D2), "_view", 2)
This is the format inherited from MOFA v1. A list of matrices, where each entry corresponds to one view. Samples are stored in columns and features in rows.
# groups is a character or factor vector that indicates the group ID for each sample
# `groups` is a character or factor vector assigning every sample (column)
# to a group; here the first half are "A" and the second half are "B"
groups <- c(rep("A", N/2), rep("B", N/2))

# Create a MOFA object with one view and two groups
MOFAobject <- create_mofa(list(view1 = view1), groups = groups)

# Create a MOFA object with two views and no groups (MOFA v1 behaviour)
MOFAobject <- create_mofa(list(view1 = view1, view2 = view2))

# Create a MOFA object with two views and two groups
MOFAobject <- create_mofa(list(view1 = view1, view2 = view2), groups = groups)

print(MOFAobject)
## Untrained MOFA model with the following characteristics:
## Number of views: 2
## Views names: view1 view2
## Number of features (per view): 250 500
## Number of groups: 2
## Groups names: A B
## Number of samples (per group): 50 50
A long data.frame with columns `sample`, `group`, `feature`, `view` and `value`.
I think this is the most intuitive format, as it summarises all omics/groups in a single data structure. Also, there is no need to add rows that correspond to missing data.
Create long data.frame format from the matrices:
# Map each sample to its group, used to annotate the long-format tables below
dt.group <- data.table(sample = paste0("sample_", 1:N), group = groups)

# Helper: reshape one view matrix (features x samples) into long format
# (feature, sample, value), tag it with the view name, and attach each
# sample's group by merging on the shared "sample" column.
melt_view <- function(mat, view_name) {
  mat %>%
    reshape2::melt() %>%
    as.data.table %>%
    setnames(c("feature", "sample", "value")) %>%
    .[, view := view_name] %>%
    merge(dt.group)
}

dt1 <- melt_view(view1, "view1")
dt2 <- melt_view(view2, "view2")
dt <- rbind(dt1, dt2)
head(dt)
## sample feature value view group
## 1: sample_1 feature1_view1 -0.4149686 view1 A
## 2: sample_1 feature2_view1 -0.8280371 view1 A
## 3: sample_1 feature3_view1 1.6010638 view1 A
## 4: sample_1 feature4_view1 0.1881314 view1 A
## 5: sample_1 feature5_view1 1.2048660 view1 A
## 6: sample_1 feature6_view1 -2.3106304 view1 A
MOFAobject <- create_mofa(dt)
## Creating MOFA object from a data.frame...
print(MOFAobject)
## Untrained MOFA model with the following characteristics:
## Number of views: 2
## Views names: view1 view2
## Number of features (per view): 250 500
## Number of groups: 2
## Groups names: A B
## Number of samples (per group): 50 50
Seurat is a popular tool for the analysis of single-cell omics.
Create a Seurat object with the data
# Example: creating a MOFA object directly from a Seurat object. Left
# commented out because it requires a Seurat object named `seurat` in the
# workspace, which this vignette does not build.
# MOFAobject <- create_mofa(seurat,
#   groups = seurat@meta.data$group, # Groups can be extracted from the metadata
#   features = VariableFeatures(seurat), # select features from the seurat object
#   slot = "data") # select slot from each assay
# print(MOFAobject)
# Visualise the structure of the input data: views x groups and missing samples
plot_data_overview(MOFAobject)
Important arguments (see the printed defaults below):
- scale_views: scale each view to unit variance? (default FALSE)
- scale_groups: scale each group to unit variance? (default FALSE)
# Retrieve the default data processing options
# (scale_views, scale_groups, views, groups — printed below)
data_opts <- get_default_data_options(MOFAobject)
data_opts
## $scale_views
## [1] FALSE
##
## $scale_groups
## [1] FALSE
##
## $views
## [1] "view1" "view2"
##
## $groups
## [1] "A" "B"
Important arguments (see the printed defaults below):
- num_factors: number of factors
- likelihoods: likelihood per view (default "gaussian")
- spikeslab_factors: use spike-slab sparsity prior in the factors? (default FALSE)
- spikeslab_weights: use spike-slab sparsity prior in the weights? (default TRUE)
- ard_factors: use ARD prior in the factors? (default TRUE if using multiple groups)
- ard_weights: use ARD prior in the weights? (default TRUE)
Only change the default model options if you are familiar with the underlying mathematical model!
# Retrieve the default model options
# (likelihoods, num_factors, sparsity priors — printed below)
model_opts <- get_default_model_options(MOFAobject)
head(model_opts)
## $likelihoods
## view1 view2
## "gaussian" "gaussian"
##
## $num_factors
## [1] 15
##
## $spikeslab_factors
## [1] FALSE
##
## $spikeslab_weights
## [1] TRUE
##
## $ard_factors
## [1] TRUE
##
## $ard_weights
## [1] TRUE
# Retrieve the default training options
# (maxiter, convergence_mode, ELBO monitoring, stochastic flag — printed below)
train_opts <- get_default_training_options(MOFAobject)
head(train_opts)
## $maxiter
## [1] 5000
##
## $convergence_mode
## [1] "fast"
##
## $verbose
## [1] FALSE
##
## $startELBO
## [1] 1
##
## $freqELBO
## [1] 5
##
## $stochastic
## [1] FALSE
If the number of samples is very large (at the order of >1e4), you may want to try the stochastic inference scheme. If combined with GPUs, it makes inference significantly faster. However, it requires some additional hyperparameters that in some data sets may need to be optimised (vignette in preparation):
# Retrieve the default hyperparameters for stochastic variational inference
# (batch_size, learning_rate, forgetting_rate — printed below)
stochastic_opts <- get_default_stochastic_options(MOFAobject)
head(stochastic_opts)
## $batch_size
## [1] 0.5
##
## $learning_rate
## [1] 0.75
##
## $forgetting_rate
## [1] 0.1
# Bundle the data, model and training options into the MOFA object and run
# consistency checks before training
MOFAobject <- prepare_mofa(
  object = MOFAobject,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
  # stochastic_options = stochastic_opts # optional, only for stochastic inference
)

# Write the trained model to a portable temporary location instead of a
# hard-coded user-specific absolute path
outfile <- file.path(tempdir(), "model.hdf5")
MOFAobject.trained <- run_mofa(MOFAobject, outfile)
If everything is successful, you should observe an output analogous to the following:
######################################
## Training the model with seed 1 ##
######################################
Iteration 1: time=0.03, ELBO=-52650.68, deltaELBO=837116.802 (94.082647669%), Factors=10
(...)
Iteration 9: time=0.04, ELBO=-50114.43, deltaELBO=23.907 (0.002686924%), Factors=10
#######################
## Training finished ##
#######################
Saving model in /Users/ricard/data/mofa2/hdf5/model.hdf5...
This finishes the tutorial on how to train a MOFA object from R. To continue, follow the downstream analysis vignette.