################################################################################
# 1. Get the list of packages you have installed,
# use priority to exclude base and recommended packages.
# that may have been distributed with R.
# (pkgList <- installed.packages(priority='NA')[,'Package'])
# 2. Find out which packages are on CRAN and R-Forge. Because
# of R-Forge build capacity is currently limiting the number of
# binaries available, it is queried for source packages only.
# (CRANpkgs <- available.packages(contriburl=contrib.url('http://cran.r-project.org'))[,'Package'])
# (forgePkgs <- available.packages(contriburl=contrib.url('http://r-forge.r-project.org', type='source'))[,'Package'])
# 3. Calculate the set of packages which are installed on your machine,
# not on CRAN but also present on R-Force.
# (pkgsToUp <- intersect(setdiff(pkgList, CRANpkgs), forgePkgs))
# 4. Update the packages, using oldPkgs to restrict the list considered.
# update.packages(checkBuilt=TRUE, ask=FALSE, repos="http://r-forge.r-project.org", oldPkgs=pkgsToUp)
################################################################################
# Update all packages
# update.packages(ask = FALSE, dependencies = c('Suggests'))
source("https://bioconductor.org/biocLite.R")
## Bioconductor version 3.4 (BiocInstaller 1.24.0), ?biocLite for help
## A new version of Bioconductor is available after installing the most
## recent version of R; see http://bioconductor.org/install
if (!require("GEOquery")) {
biocLite("GEOquery")
}
## Loading required package: GEOquery
## Warning: package 'GEOquery' was built under R version 3.3.1
## Loading required package: Biobase
## Warning: package 'Biobase' was built under R version 3.3.1
## Loading required package: BiocGenerics
## Warning: package 'BiocGenerics' was built under R version 3.3.1
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, cbind, colnames,
## do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff,
## sort, table, tapply, union, unique, unsplit, which, which.max,
## which.min
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)
library(GEOquery)
if (!require("Biobase")) {
biocLite("Biobase")
}
library(Biobase)
if (!require("limma")) {
biocLite("limma")
}
## Loading required package: limma
## Warning: package 'limma' was built under R version 3.3.3
##
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
library(limma)
# source("http://bioconductor.org/biocLite.R")
# biocLite("genefilter")
# library(genefilter)
if (!require("plyr")) {
install.packages(pkgs="plyr")
}
## Loading required package: plyr
## Warning: package 'plyr' was built under R version 3.3.3
library(plyr)
# install.packages("XML", repos=c("http://rstudio.org/_packages", "http://cran.rstudio.com"))
# library("XML")
# install.packages("RCurl", repos=c("http://rstudio.org/_packages", "http://cran.rstudio.com"))
# library("RCurl")
# install.packages("rvest", repos=c("http://rstudio.org/_packages", "http://cran.rstudio.com"))
# library("rvest")
getPkgVigs()
## package
## 1 limma
## 2 GEOquery
## 3 Biobase
## 4 Biobase
## 5 Biobase
## 6 parallel
## 7 utils
## filename
## 1 C:/Program Files/R/R-3.3.0/library/limma/doc/intro.pdf
## 2 C:/Program Files/R/R-3.3.0/library/GEOquery/doc/GEOquery.html
## 3 C:/Program Files/R/R-3.3.0/library/Biobase/doc/BiobaseDevelopment.pdf
## 4 C:/Program Files/R/R-3.3.0/library/Biobase/doc/ExpressionSetIntroduction.pdf
## 5 C:/Program Files/R/R-3.3.0/library/Biobase/doc/esApply.pdf
## 6 C:/Program Files/R/R-3.3.0/library/parallel/doc/parallel.pdf
## 7 C:/Program Files/R/R-3.3.0/library/utils/doc/Sweave.pdf
## title
## 1 Limma One Page Introduction
## 2 Using GEOquery
## 3 Notes for eSet developers
## 4 An introduction to Biobase and ExpressionSets
## 5 esApply Introduction
## 6 Package 'parallel'
## 7 Sweave User Manual
http://www.bioconductor.org/packages/release/bioc/manuals/Biobase/man/Biobase.pdf
http://www.bioconductor.org/packages/release/bioc/manuals/GEOquery/man/GEOquery.pdf
GEO Series (GSE), GEO platforms (GPL), and GEO samples (GSM)
The Gene Expression Omnibus (GEO) from NCBI serves as a public repository for a wide range of high-throughput experimental data. These data include single and dual channel microarray-based experiments measuring mRNA, genomic DNA, and protein abundance, as well as non-array techniques such as serial analysis of gene expression (SAGE), and mass spectrometry proteomic data. At the most basic level of organization of GEO, there are three entity types that may be supplied by users: Platforms, Samples, and Series. Additionally, there is a curated entity called a GEO dataset.
GDS-class: A class describing a GEO GDS entity
GEOData-class: virtual class for holding GEO samples, platforms, and datasets
GEODataTable-class: Contains the column descriptions and data for the datatable part of a GEO object
GPL-class: Contains a full GEO Platform entity
GSE-class: Contains a GEO Series entity
GPLList signature(object = “GSE”): returns a list, each item of the list being a GPL object
GSMList signature(object = “GSE”): returns a list, each item of the list being a GSM object
Meta signature(object = “GSE”): returns a list, the metadata associated with the GSE
GSM-class: A class containing a GEO Sample entity
getGEO(GEO = NULL, filename = NULL, destdir = tempdir(), GSElimits=NULL, GSEMatrix=TRUE,AnnotGPL=FALSE,getGPL=TRUE)
This function is the main user-level function in the GEOquery package. It directs the download (if no filename is specified) and parsing of a GEO SOFT format file into an R data structure specifically designed to make access to each of the important parts of the GEO SOFT format easily accessible.
getGEOfile(GEO, destdir = tempdir(), AnnotGPL = FALSE, amount = c(“full”, “brief”, “quick”, “data”))
This function simply downloads a SOFT format file associated with the GEO accession number given.
getGEOSuppFiles(GEO, makeDirectory = TRUE, baseDir = getwd())
NCBI GEO allows supplemental files to be attached to GEO Series (GSE), GEO platforms (GPL), and GEO samples (GSM). This function “knows” how to get these files based on the GEO accession. No parsing of the downloaded files is attempted, since the file format is not generally knowable by the computer.
getGSEDataTables(GSE)
In some cases, instead of individual sample records (GSM) containing information regarding sample phenotypes, the GEO Series contains that information in an attached data table. And example is given by GSE3494 where there are two data tables with important information contained within them. Using getGEO with the standard parameters downloads the GSEMatrix file which, unfortunately, does not contain the information in the data tables. This function simply downloads the “header” information from the GSE record and parses out the data tables into R data.frames.
parseGEO(fname, GSElimits, destdir=tempdir(), AnnotGPL=FALSE, getGPL=TRUE)
parseGPL(fname)
parseGDS(fname)
parseGSE(fname, GSElimits)
parseGSM(fname)
GDS2MA(GDS,do.log2=FALSE,GPL=NULL,AnnotGPL=TRUE,getGPL=TRUE)
GDS2eSet(GDS,do.log2=FALSE,GPL=NULL,AnnotGPL=TRUE,getGPL=TRUE)
gunzip(filename, destname = gsub(“[.]gz$”, “”, filename), overwrite = FALSE, remove = TRUE, …
Gene Expression Omnibus: https://www.ncbi.nlm.nih.gov/geo/
GSE56094
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4682296/
| Series GSE56094 | Query DataSets for GSE56094 |
|---|---|
| Status | Public on Jun 02, 2015 |
| Title | Transcriptional dynamics driving MAMP-triggered immunity and pathogen effector-mediated immunosuppression in Arabidopsis leaves following infection with Pseudomonas syringae pv. tomato DC3000 |
| Organism | Arabidopsis thaliana |
| Experiment type | Expression profiling by array |
| Summary | High-resolution temporal transcriptomic analysis of Arabidopsis thaliana leaves during infection by Pseudomonas syringae DC3000 and DC3000hrpA-thaliana. Overall design RNA was extracted from Arabidopsis leaves after mock-inoculation, infection with P. syringae DC3000 and infection with P. syringae DC3000hrpA-. 13 time points (0, 2, 3, 4, 6, 7, 8, 10, 11, 12, 14, 16, 17.5 hours post inoculation) were sampled from each treatment and four biological replicates (a single leaf) were harvested at each time point for each treatment. This led to 156 biological samples (13 time points x 3 treatments x 4 biological replicates). Four technical replicates of each biological sample were generated (624). The microarrays were two-channel CATMA arrays and hence two samples are hybridized to each array resulting in 312 arrays used in the experiment. A “loop design” was developed to enable efficient extraction of information about sample comparisons. Half the arrays were used to directly compare samples at different time points within the same treatment, and half directly compared the same time point and adjacent time points between treatments. This design maximizes the power of data comparisons over time and between treatments. |
| Contributor(s) | Lewis LA, Polanski K, de Torres-Zabala M, Jayaraman S, Bowden L, Moore JD, Penfold CA, Jenkins DJ, Hill C, Baxter L, Kulasekaran S, Truman W, Littlejohn G, Pruskinska J, Mead A, Steinbrenner J, Hickman R, Rand D, Wild DL, Ott S, Buchanan-Wollaston V, Smirnoff N, Beynon J, Denby K, Grant M |
| Citation(s) | Lewis LA, Polanski K, de Torres-Zabala M, Jayaraman S et al. Transcriptional Dynamics Driving MAMP-Triggered Immunity and Pathogen Effector-Mediated Immunosuppression in Arabidopsis Leaves Following Infection with Pseudomonas syringae pv tomato DC3000. Plant Cell 2015 Nov;27(11):3038-64. PMID: 26566919 |
The aim of the Complete Arabidopsis Transcriptome MicroArray (CATMA) project was the design and production of high quality Gene-specific Sequence Tags (GSTs) covering most Arabidopsis genes. The GST repertoire is used by numerous groups for the production of DNA arrays for transcript profiling experiments.
CATMA microarrays constitute the foundation of the CAGE project aiming at the construction of a gene expression reference database for Arabidopsis. The CATMA GSTs are also the basic materials in the AGRIKOLA project focusing on the large-scale systematic RNAi silencing of Arabidopsis genes.
Get GEO data
GSE56094
# http://bioconductor.org/packages/release/bioc/vignettes/GEOquery/inst/doc/GEOquery.html
# https://support.bioconductor.org/p/33487/
# you can run limma for differential expression analysis on your object 'gse' which is a list of ExpressionSets,
# which contains one single element because the dataset is not too large.
# This contains normalized log2 ratios after pre-processing done by the authors.
# assumes only one platform in the GSE
gse = getGEO('GSE56094')[[1]]
## Warning in read.table(file = file, header = header, sep = sep, quote =
## quote, : not all columns named in 'colClasses' exist
save (gse, file = 'GEO_GSE56094.RData')
# gse = get(load('GEO_GSE56094.RData'))
class(gse)
## [1] "ExpressionSet"
## attr(,"package")
## [1] "Biobase"
classVersion(gse)
## R Biobase eSet ExpressionSet
## "3.3.1" "2.34.0" "1.3.0" "1.0.0"
description(gse)
## Experiment data
## Experimenter name:
## Laboratory:
## Contact information:
## Title:
## URL:
## PMIDs:
## No abstract available.
show(gse)
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 30336 features, 156 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: GSM1354976 GSM1354977 ... GSM1355131 (156 total)
## varLabels: title geo_accession ... data_row_count (38 total)
## varMetadata: labelDescription
## featureData
## featureNames: CATMA1a00010 CATMA1a00020 ... CATMA5ctrl30 (30336
## total)
## fvarLabels: ID Chrom ... SPOT_ID (7 total)
## fvarMetadata: Column Description labelDescription
## experimentData: use 'experimentData(object)'
## Annotation: GPL10840
varMetadata(gse)
## labelDescription
## title <NA>
## geo_accession <NA>
## status <NA>
## submission_date <NA>
## last_update_date <NA>
## type <NA>
## channel_count <NA>
## source_name_ch1 <NA>
## organism_ch1 <NA>
## characteristics_ch1 <NA>
## characteristics_ch1.1 <NA>
## characteristics_ch1.2 <NA>
## characteristics_ch1.3 <NA>
## growth_protocol_ch1 <NA>
## molecule_ch1 <NA>
## extract_protocol_ch1 <NA>
## label_ch1 <NA>
## label_protocol_ch1 <NA>
## taxid_ch1 <NA>
## hyb_protocol <NA>
## scan_protocol <NA>
## scan_protocol.1 <NA>
## description <NA>
## data_processing <NA>
## platform_id <NA>
## contact_name <NA>
## contact_email <NA>
## contact_department <NA>
## contact_institute <NA>
## contact_address <NA>
## contact_city <NA>
## contact_zip/postal_code <NA>
## contact_country <NA>
## supplementary_file <NA>
## supplementary_file.1 <NA>
## supplementary_file.2 <NA>
## supplementary_file.3 <NA>
## data_row_count <NA>
protocolData(gse)
## An object of class 'AnnotatedDataFrame': none
# isnorm = getGEO('GSE56094', GSEMatrix=FALSE)
# head(Meta(isnorm))
# names(GSMList(isnorm))
# GSMList(isnorm)[[1]]
featureData(gse)
## An object of class 'AnnotatedDataFrame'
## featureNames: CATMA1a00010 CATMA1a00020 ... CATMA5ctrl30 (30336
## total)
## varLabels: ID Chrom ... SPOT_ID (7 total)
## varMetadata: Column Description labelDescription
(info = varLabels(gse))
## [1] "title" "geo_accession"
## [3] "status" "submission_date"
## [5] "last_update_date" "type"
## [7] "channel_count" "source_name_ch1"
## [9] "organism_ch1" "characteristics_ch1"
## [11] "characteristics_ch1.1" "characteristics_ch1.2"
## [13] "characteristics_ch1.3" "growth_protocol_ch1"
## [15] "molecule_ch1" "extract_protocol_ch1"
## [17] "label_ch1" "label_protocol_ch1"
## [19] "taxid_ch1" "hyb_protocol"
## [21] "scan_protocol" "scan_protocol.1"
## [23] "description" "data_processing"
## [25] "platform_id" "contact_name"
## [27] "contact_email" "contact_department"
## [29] "contact_institute" "contact_address"
## [31] "contact_city" "contact_zip/postal_code"
## [33] "contact_country" "supplementary_file"
## [35] "supplementary_file.1" "supplementary_file.2"
## [37] "supplementary_file.3" "data_row_count"
for (i in info) {
if (!grepl('/', i)) {
cat(i, '::\t')
print(eval(parse(text = paste0('gse$',i,'[[1]]'))))
cat('\n')
}
}
## title :: [1] Mock 1 hours biorep A
## 156 Levels: Mock 1 hours biorep A ... P. syringae pv tomato DC3000 wildtype 9 hours biorep D
##
## geo_accession :: [1] GSM1354976
## 156 Levels: GSM1354976 GSM1354977 GSM1354978 GSM1354979 ... GSM1355131
##
## status :: [1] Public on Jun 02 2015
## Levels: Public on Jun 02 2015
##
## submission_date :: [1] Mar 21 2014
## Levels: Mar 21 2014
##
## last_update_date :: [1] Jun 03 2015
## Levels: Jun 03 2015
##
## type :: [1] RNA
## Levels: RNA
##
## channel_count :: [1] 1
## Levels: 1
##
## source_name_ch1 :: [1] Mock 1 hours biorep A
## 156 Levels: Mock 1 hours biorep A ... P. syringae pv tomato DC3000 wildtype 9 hours biorep D
##
## organism_ch1 :: [1] Arabidopsis thaliana
## Levels: Arabidopsis thaliana
##
## characteristics_ch1 :: [1] strain: Col0
## Levels: strain: Col0
##
## characteristics_ch1.1 :: [1] treatment: Mock
## 3 Levels: treatment: Infected P. syringae pv tomato DC3000 hrpA- mutant ...
##
## characteristics_ch1.2 :: [1] hours_since_infection: 1
## 13 Levels: hours_since_infection: 1 ... hours_since_infection: 9
##
## characteristics_ch1.3 :: [1] tissue: leaf
## Levels: tissue: leaf
##
## growth_protocol_ch1 :: [1] Arabidopsis plants Col-0 were grown under a 16:8 hr light:dark cycle (lights on 04:00 to 20:00) at 20°C, 60% humidity and light intensity of 100 μmol photons.m-2.s-1. Arabidopsis seed was stratified for three days in 0.1% agar at 4°C before sowing onto Arabidopsis soil mix (Scotts Levingtons F2s compost:sand:fine grade vermiculite in a ratio of 6:1:1).
## Levels: Arabidopsis plants Col-0 were grown under a 16:8 hr light:dark cycle (lights on 04:00 to 20:00) at 20°C, 60% humidity and light intensity of 100 μmol photons.m-2.s-1. Arabidopsis seed was stratified for three days in 0.1% agar at 4°C before sowing onto Arabidopsis soil mix (Scotts Levingtons F2s compost:sand:fine grade vermiculite in a ratio of 6:1:1).
##
## molecule_ch1 :: [1] total RNA
## Levels: total RNA
##
## extract_protocol_ch1 :: [1] Total RNA was isolated from four individual leaves from each sampled time point (arbitrarily labelled as biological replicates A, B, C and D) using TRIzol reagent (Invitrogen), purified with RNeasy columns (Qiagen) and amplified using the MessageAmp II aRNA Amplification kit (Ambion) in accordance with the kit protocol with a single round of amplification.
## Levels: Total RNA was isolated from four individual leaves from each sampled time point (arbitrarily labelled as biological replicates A, B, C and D) using TRIzol reagent (Invitrogen), purified with RNeasy columns (Qiagen) and amplified using the MessageAmp II aRNA Amplification kit (Ambion) in accordance with the kit protocol with a single round of amplification.
##
## label_ch1 :: [1] Cy3 or Cy5
## Levels: Cy3 or Cy5
##
## label_protocol_ch1 :: [1] Cy3 and Cy5 labelled cDNA probes were prepared by reverse transcribing 5µg of aRNA with Cy3- or Cy5- dCTP (GE Healthcare) and a modified dNTP mix (10mM each dATP, dGTP and dTTP; 2mM dCTP) using random primers (Invitrogen) and SuperScript II reverse transcriptase (Invitrogen), with the inclusion of RNase inhibitor (RNaseOUT, Invitrogen) and DTT. Labelled probes were purified using QiaQuick PCR Purification columns (Qiagen), freeze-dried, and resuspended in 50µl hybridization buffer (25% formamide, 5xSSC, 0.1% SDS, 0.5µg/µl yeast tRNA [Invitrogen]).
## Levels: Cy3 and Cy5 labelled cDNA probes were prepared by reverse transcribing 5µg of aRNA with Cy3- or Cy5- dCTP (GE Healthcare) and a modified dNTP mix (10mM each dATP, dGTP and dTTP; 2mM dCTP) using random primers (Invitrogen) and SuperScript II reverse transcriptase (Invitrogen), with the inclusion of RNase inhibitor (RNaseOUT, Invitrogen) and DTT. Labelled probes were purified using QiaQuick PCR Purification columns (Qiagen), freeze-dried, and resuspended in 50µl hybridization buffer (25% formamide, 5xSSC, 0.1% SDS, 0.5µg/µl yeast tRNA [Invitrogen]).
##
## taxid_ch1 :: [1] 3702
## Levels: 3702
##
## hyb_protocol :: [1] The order in which the 288 slides were hybridized and scanned was randomised to minimise the impact on differences between samples of any potential variation in the processing conditions. According to this randomised experimental design combinations of labelled samples were hybridized to slides overnight at 42oC.
## Levels: The order in which the 288 slides were hybridized and scanned was randomised to minimise the impact on differences between samples of any potential variation in the processing conditions. According to this randomised experimental design combinations of labelled samples were hybridized to slides overnight at 42oC.
##
## scan_protocol :: [1] Following hybridization, slides were washed and scanned using an Affymetrix 428 array scanner at 532nm (Cy3) and 635nm (Cy5).
## Levels: Following hybridization, slides were washed and scanned using an Affymetrix 428 array scanner at 532nm (Cy3) and 635nm (Cy5).
##
## scan_protocol.1 :: [1] Scanned data were quantified using Imagene 7.5.0 software (BioDiscovery, Inc.).
## Levels: Scanned data were quantified using Imagene 7.5.0 software (BioDiscovery, Inc.).
##
## description :: [1] Randomised loop design providing dye-swaps between treatments, time points and bioreps
## Levels: Randomised loop design providing dye-swaps between treatments, time points and bioreps
##
## data_processing :: [1] Quantified microarray data were Lowess normalized within pintip groups and within arrays, then variation due to arrays and dyes was removed by a random effects model, and averaged, in log space, using R/MAANOVA (MicroArray ANalysis Of VAriance) (Wu et al. 2003). Oligo sequences of CATMA array probes were mapped to individual mRNA sequences of transcripts from the TAIR9 genome assembly using BLASTn with e-value cutoff of 0.01. Additionally, alignments were filtered to exclude alignments shorter than 30bp or with less than 80% sequence identity.
## Levels: Quantified microarray data were Lowess normalized within pintip groups and within arrays, then variation due to arrays and dyes was removed by a random effects model, and averaged, in log space, using R/MAANOVA (MicroArray ANalysis Of VAriance) (Wu et al. 2003). Oligo sequences of CATMA array probes were mapped to individual mRNA sequences of transcripts from the TAIR9 genome assembly using BLASTn with e-value cutoff of 0.01. Additionally, alignments were filtered to exclude alignments shorter than 30bp or with less than 80% sequence identity.
##
## platform_id :: [1] GPL10840
## Levels: GPL10840
##
## contact_name :: [1] Jonathan,David,Moore
## Levels: Jonathan,David,Moore
##
## contact_email :: [1] jonathan.moore@tgac.ac.uk
## Levels: jonathan.moore@tgac.ac.uk
##
## contact_department :: [1] Platforms and Pipelines
## Levels: Platforms and Pipelines
##
## contact_institute :: [1] The Genome Analysis Centre
## Levels: The Genome Analysis Centre
##
## contact_address :: [1] Norwich Research Park
## Levels: Norwich Research Park
##
## contact_city :: [1] Norwich
## Levels: Norwich
##
## contact_country :: [1] United Kingdom
## Levels: United Kingdom
##
## supplementary_file :: [1] ftp://ftp.ncbi.nlm.nih.gov/pub/geo/DATA/supplementary/samples/GSM1354nnn/GSM1354976/GSM1354976_LB_13781092_Cy5.txt.gz
## 156 Levels: ftp://ftp.ncbi.nlm.nih.gov/pub/geo/DATA/supplementary/samples/GSM1354nnn/GSM1354976/GSM1354976_LB_13781092_Cy5.txt.gz ...
##
## supplementary_file.1 :: [1] ftp://ftp.ncbi.nlm.nih.gov/pub/geo/DATA/supplementary/samples/GSM1354nnn/GSM1354976/GSM1354976_LB_13781096_Cy3.txt.gz
## 156 Levels: ftp://ftp.ncbi.nlm.nih.gov/pub/geo/DATA/supplementary/samples/GSM1354nnn/GSM1354976/GSM1354976_LB_13781096_Cy3.txt.gz ...
##
## supplementary_file.2 :: [1] ftp://ftp.ncbi.nlm.nih.gov/pub/geo/DATA/supplementary/samples/GSM1354nnn/GSM1354976/GSM1354976_LB_13781649_Cy5.txt.gz
## 156 Levels: ftp://ftp.ncbi.nlm.nih.gov/pub/geo/DATA/supplementary/samples/GSM1354nnn/GSM1354976/GSM1354976_LB_13781649_Cy5.txt.gz ...
##
## supplementary_file.3 :: [1] ftp://ftp.ncbi.nlm.nih.gov/pub/geo/DATA/supplementary/samples/GSM1354nnn/GSM1354976/GSM1354976_LB_13876881_Cy3.txt.gz
## 156 Levels: ftp://ftp.ncbi.nlm.nih.gov/pub/geo/DATA/supplementary/samples/GSM1354nnn/GSM1354976/GSM1354976_LB_13876881_Cy3.txt.gz ...
##
## data_row_count :: [1] 30336
## Levels: 30336
fvarMetadata(gse)
## Column
## ID ID
## Chrom Chrom
## TAIR9_best TAIR9_best
## TAIR9_others TAIR9_others
## SEQUENCE SEQUENCE
## ORF ORF
## SPOT_ID SPOT_ID
## Description
## ID
## Chrom TAIR9 pseudochromosome containing best BLAST alignment to probe sequence
## TAIR9_best TAIR9 gene model whose mRNA sequence contains best BLAST alignment to probe sequence
## TAIR9_others Other TAIR9 gene models with alignments to probe sequence
## SEQUENCE Probe sequece
## ORF LINK_PRE:"http://www.ncbi.nlm.nih.gov/gene/?term=" LINK_SUF:"[gene name]"
## SPOT_ID
## labelDescription
## ID <NA>
## Chrom <NA>
## TAIR9_best <NA>
## TAIR9_others <NA>
## SEQUENCE <NA>
## ORF <NA>
## SPOT_ID <NA>
head(fData(gse))
## ID Chrom TAIR9_best TAIR9_others
## CATMA1a00010 CATMA1a00010 1 AT1G01010
## CATMA1a00020 CATMA1a00020 1 AT1G01020 AT1G05400
## CATMA1a00030 CATMA1a00030 1 AT1G01030
## CATMA1a00035 CATMA1a00035 1 AT1G01040
## CATMA1a00040 CATMA1a00040 1 AT1G01050
## CATMA1a00045 CATMA1a00045 1 AT1G01060
## SEQUENCE
## CATMA1a00010 TCAACCAATCACGTCAACGAAATTCAGGATCTTACAACACTTACTCTGAGTATGATTCAGCAAATCATGGCCAGCAGTTTAATGAAAACTCTAACATTATGCAGCAGCAACCACTTCAAGGATCATTCAACCCTCTCCTTGAGTATGATTTTGCAAATCACGGCGGTCAGTGGCTGAGTGACTATATCGACCTGCAACAGCAAGTTCCTTACTTGGCACCTTATGAAAATGAGTCGGAGATGATTTGGAAGCATGTGATTGAAGAA
## CATMA1a00020 GTGGTGGTACCCTCCTTGGAATGGTGGGCATATGTGGAACCCTCCTATGAATGTTGGAGATATGTGGGACTCTCCTATGAATGTGGGGCATCAACACATTGTTGCAACACCAAGTGAATGTCATAGGAAAGACGAGGAGACGGTTGAAGCCGAAGTTGAGGCTCCCCTTGATTCTTATCCGGAGAATGCTACTACCATGGCGAGAAAGCAAC
## CATMA1a00030 CCAAACTTTACATAGATTGGAGGCATAGACCCGACATGAGCCTCGTTCAAGCACATCAGTTTGGTAATTTTGGTTTCAATTTCAATTTCCCGACCACTTCTCAATATTCCAACAGATTTCATCCATTGCCAGAATATAACTCCGTCCCGATTCACCGGGGCTTAAACATCGGAAATCACCAACGTTCCTATTATAACACCCAGCGTCAAGAGTTCGTAG
## CATMA1a00035 GGCTCATGCAAAGAGATTTACGTTTGGGGTAAGAGTTAATACGAGCGATAGAGGATGGACCGATGAGTGTATTGGCGAGCCAATGCCGAGTGTTAAGAAAGCTAAGGATTCAGCTGCGGTTCTTCTACTTGAGCTTTTAAATAAAACTTTTTCTTGATTCTTTTACTCTCTTCAACGAGATGTAGTCATTACATTTTAAACCTTAAAACCATAGTGGTTG
## CATMA1a00040 AGCTTCTCCTCAGAAGATTTCTGCAGCATCTATGTTTCTGTTACTTTTCATTGATAATAAGAAGCATATGATCCTAATTGATATTACTTTTATCTCTGCATTTTTCTTCCTTATCATTTGGTCCTTCTGTAATTTCGTTTTAAAGACCCT
## CATMA1a00045 TCCTAATTTCAAAAGCCAGGATTCTTGTGCTGCAGACCAAGAAGGAGTAGTAATGATCGGTGTTGGAACATGCAAGAGTCTTAAAACGAGACAGACAGGATTTAAGCCATACAAGAGATGTTCAATGGAAGTGAAAGAGAGCCAAGTTGGGAACATAAACAATCAAAGTGATGAAAAAGTCTGCAAAAGGCTTCGATTGGAAGGAGAAGCTTCTACATGACAGACTTGGAGGTAAAAAAAAAACATCCACATTTTTATCAATATCTTTAAATCTAGTGTTAGTAGTTTGCTTCTCCAATCTTTATGAAAGAGACTTTTAATTTTCCTTCCGAACATTTCTTTGGTCATGTCAGGTTCTGTACCATATTACCCCATGTCTTGTCTCTTGTCTCTGTTTG
## ORF SPOT_ID
## CATMA1a00010 AT1G01010
## CATMA1a00020 AT1G01020
## CATMA1a00030 AT1G01030
## CATMA1a00035 AT1G01040
## CATMA1a00040 AT1G01050
## CATMA1a00045 AT1G01060
fvarLabels(gse)
## [1] "ID" "Chrom" "TAIR9_best" "TAIR9_others"
## [5] "SEQUENCE" "ORF" "SPOT_ID"
Translation table: CATMA to Ath gene
catma = fData(gse)
cColNames = fvarLabels(gse)
ncol = length(cColNames)
dim(catma)
## [1] 30336 7
catma[catma==""]<-NA
numNAs <- apply(catma, 1, function(x) sum(is.na(x)))
length(numNAs) == dim(catma)[1]
## [1] TRUE
sum(is.na(catma$TAIR9_best))
## [1] 2728
# keep ones that have assigned Ath gene
catma = catma[!is.na(catma$TAIR9_best), ]
dim(catma)
## [1] 27608 7
Get expression values
# write.table(data.frame(fData(gse),exprs(gse)),sep="\t",row.names=FALSE,file='testGSE56094Table.txt')
myE = exprs(gse)
head(myE)[,1:6]
## GSM1354976 GSM1354977 GSM1354978 GSM1354979 GSM1354980
## CATMA1a00010 9.429444 11.295529 10.720168 11.131236 10.630403
## CATMA1a00020 7.983924 7.732327 7.730905 7.942637 7.696408
## CATMA1a00030 8.298806 8.758403 8.341977 8.835087 8.293516
## CATMA1a00035 10.233715 10.668946 9.881458 10.483725 10.122867
## CATMA1a00040 11.137192 10.798991 10.760669 11.094889 11.641954
## CATMA1a00045 9.907937 9.546775 9.292673 8.630197 8.611419
## GSM1354981
## CATMA1a00010 10.856614
## CATMA1a00020 7.968995
## CATMA1a00030 8.387448
## CATMA1a00035 10.275984
## CATMA1a00040 11.872051
## CATMA1a00045 8.989938
dim(myE)
## [1] 30336 156
nold = dim(myE)[2]
Plot values per experiment
# par(mfrow=c(2,1))
plot(density(myE[,1]), col=1, xlim=c(0,20), ylim=c(0,0.60), main = 'Kernel Density Estimation')
for (n in 2:nold) {lines(density(myE[,n]), col=n, xlim=c(0,20), ylim=c(0,0.60))}
boxplot(as.data.frame(myE[,1:nold]), col=rainbow(24), main= "Normalised data", las=3, cex.axis = 0.5)
# par(mfrow=c(1,1))
Extract expression data for wanted experimets
| Order | Sample ID | Sample description | Order | Sample ID | Sample description | Order | Sample ID | Sample description |
|---|---|---|---|---|---|---|---|---|
| 1 | GSM1354976 | Mock 1 hours biorep A | 53 | GSM1355028 | P. syringae pv tomato DC3000 hrpA- mutant 1 hours biorep A | 105 | GSM1355080 | P. syringae pv tomato DC3000 wildtype 1 hours biorep A |
| 2 | GSM1354977 | Mock 2 hours biorep A | 54 | GSM1355029 | P. syringae pv tomato DC3000 hrpA- mutant 2 hours biorep A | 106 | GSM1355081 | P. syringae pv tomato DC3000 wildtype 2 hours biorep A |
| 3 | GSM1354978 | Mock 3 hours biorep A | 55 | GSM1355030 | P. syringae pv tomato DC3000 hrpA- mutant 3 hours biorep A | 107 | GSM1355082 | P. syringae pv tomato DC3000 wildtype 3 hours biorep A |
| 4 | GSM1354979 | Mock 4 hours biorep A | 56 | GSM1355031 | P. syringae pv tomato DC3000 hrpA- mutant 4 hours biorep A | 108 | GSM1355083 | P. syringae pv tomato DC3000 wildtype 4 hours biorep A |
| 5 | GSM1354980 | Mock 5 hours biorep A | 57 | GSM1355032 | P. syringae pv tomato DC3000 hrpA- mutant 5 hours biorep A | 109 | GSM1355084 | P. syringae pv tomato DC3000 wildtype 5 hours biorep A |
| 6 | GSM1354981 | Mock 6 hours biorep A | 58 | GSM1355033 | P. syringae pv tomato DC3000 hrpA- mutant 6 hours biorep A | 110 | GSM1355085 | P. syringae pv tomato DC3000 wildtype 6 hours biorep A |
| 7 | GSM1354982 | Mock 7 hours biorep A | 59 | GSM1355034 | P. syringae pv tomato DC3000 hrpA- mutant 7 hours biorep A | 111 | GSM1355086 | P. syringae pv tomato DC3000 wildtype 7 hours biorep A |
| 8 | GSM1354983 | Mock 8 hours biorep A | 60 | GSM1355035 | P. syringae pv tomato DC3000 hrpA- mutant 8 hours biorep A | 112 | GSM1355087 | P. syringae pv tomato DC3000 wildtype 8 hours biorep A |
| 9 | GSM1354984 | Mock 9 hours biorep A | 61 | GSM1355036 | P. syringae pv tomato DC3000 hrpA- mutant 9 hours biorep A | 113 | GSM1355088 | P. syringae pv tomato DC3000 wildtype 9 hours biorep A |
| 10 | GSM1354985 | Mock 10 hours biorep A | 62 | GSM1355037 | P. syringae pv tomato DC3000 hrpA- mutant 10 hours biorep A | 114 | GSM1355089 | P. syringae pv tomato DC3000 wildtype 10 hours biorep A |
| 11 | GSM1354986 | Mock 11 hours biorep A | 63 | GSM1355038 | P. syringae pv tomato DC3000 hrpA- mutant 11 hours biorep A | 115 | GSM1355090 | P. syringae pv tomato DC3000 wildtype 11 hours biorep A |
| 12 | GSM1354987 | Mock 12 hours biorep A | 64 | GSM1355039 | P. syringae pv tomato DC3000 hrpA- mutant 12 hours biorep A | 116 | GSM1355091 | P. syringae pv tomato DC3000 wildtype 12 hours biorep A |
| 13 | GSM1354988 | Mock 13 hours biorep A | 65 | GSM1355040 | P. syringae pv tomato DC3000 hrpA- mutant 13 hours biorep A | 117 | GSM1355092 | P. syringae pv tomato DC3000 wildtype 13 hours biorep A |
| 14 | GSM1354989 | Mock 1 hours biorep B | 66 | GSM1355041 | P. syringae pv tomato DC3000 hrpA- mutant 1 hours biorep B | 118 | GSM1355093 | P. syringae pv tomato DC3000 wildtype 1 hours biorep B |
| 15 | GSM1354990 | Mock 2 hours biorep B | 67 | GSM1355042 | P. syringae pv tomato DC3000 hrpA- mutant 2 hours biorep B | 119 | GSM1355094 | P. syringae pv tomato DC3000 wildtype 2 hours biorep B |
| 16 | GSM1354991 | Mock 3 hours biorep B | 68 | GSM1355043 | P. syringae pv tomato DC3000 hrpA- mutant 3 hours biorep B | 120 | GSM1355095 | P. syringae pv tomato DC3000 wildtype 3 hours biorep B |
| 17 | GSM1354992 | Mock 4 hours biorep B | 69 | GSM1355044 | P. syringae pv tomato DC3000 hrpA- mutant 4 hours biorep B | 121 | GSM1355096 | P. syringae pv tomato DC3000 wildtype 4 hours biorep B |
| 18 | GSM1354993 | Mock 5 hours biorep B | 70 | GSM1355045 | P. syringae pv tomato DC3000 hrpA- mutant 5 hours biorep B | 122 | GSM1355097 | P. syringae pv tomato DC3000 wildtype 5 hours biorep B |
| 19 | GSM1354994 | Mock 6 hours biorep B | 71 | GSM1355046 | P. syringae pv tomato DC3000 hrpA- mutant 6 hours biorep B | 123 | GSM1355098 | P. syringae pv tomato DC3000 wildtype 6 hours biorep B |
| 20 | GSM1354995 | Mock 7 hours biorep B | 72 | GSM1355047 | P. syringae pv tomato DC3000 hrpA- mutant 7 hours biorep B | 124 | GSM1355099 | P. syringae pv tomato DC3000 wildtype 7 hours biorep B |
| 21 | GSM1354996 | Mock 8 hours biorep B | 73 | GSM1355048 | P. syringae pv tomato DC3000 hrpA- mutant 8 hours biorep B | 125 | GSM1355100 | P. syringae pv tomato DC3000 wildtype 8 hours biorep B |
| 22 | GSM1354997 | Mock 9 hours biorep B | 74 | GSM1355049 | P. syringae pv tomato DC3000 hrpA- mutant 9 hours biorep B | 126 | GSM1355101 | P. syringae pv tomato DC3000 wildtype 9 hours biorep B |
| 23 | GSM1354998 | Mock 10 hours biorep B | 75 | GSM1355050 | P. syringae pv tomato DC3000 hrpA- mutant 10 hours biorep B | 127 | GSM1355102 | P. syringae pv tomato DC3000 wildtype 10 hours biorep B |
| 24 | GSM1354999 | Mock 11 hours biorep B | 76 | GSM1355051 | P. syringae pv tomato DC3000 hrpA- mutant 11 hours biorep B | 128 | GSM1355103 | P. syringae pv tomato DC3000 wildtype 11 hours biorep B |
| 25 | GSM1355000 | Mock 12 hours biorep B | 77 | GSM1355052 | P. syringae pv tomato DC3000 hrpA- mutant 12 hours biorep B | 129 | GSM1355104 | P. syringae pv tomato DC3000 wildtype 12 hours biorep B |
| 26 | GSM1355001 | Mock 13 hours biorep B | 78 | GSM1355053 | P. syringae pv tomato DC3000 hrpA- mutant 13 hours biorep B | 130 | GSM1355105 | P. syringae pv tomato DC3000 wildtype 13 hours biorep B |
| 27 | GSM1355002 | Mock 1 hours biorep C | 79 | GSM1355054 | P. syringae pv tomato DC3000 hrpA- mutant 1 hours biorep C | 131 | GSM1355106 | P. syringae pv tomato DC3000 wildtype 1 hours biorep C |
| 28 | GSM1355003 | Mock 2 hours biorep C | 80 | GSM1355055 | P. syringae pv tomato DC3000 hrpA- mutant 2 hours biorep C | 132 | GSM1355107 | P. syringae pv tomato DC3000 wildtype 2 hours biorep C |
| 29 | GSM1355004 | Mock 3 hours biorep C | 81 | GSM1355056 | P. syringae pv tomato DC3000 hrpA- mutant 3 hours biorep C | 133 | GSM1355108 | P. syringae pv tomato DC3000 wildtype 3 hours biorep C |
| 30 | GSM1355005 | Mock 4 hours biorep C | 82 | GSM1355057 | P. syringae pv tomato DC3000 hrpA- mutant 4 hours biorep C | 134 | GSM1355109 | P. syringae pv tomato DC3000 wildtype 4 hours biorep C |
| 31 | GSM1355006 | Mock 5 hours biorep C | 83 | GSM1355058 | P. syringae pv tomato DC3000 hrpA- mutant 5 hours biorep C | 135 | GSM1355110 | P. syringae pv tomato DC3000 wildtype 5 hours biorep C |
| 32 | GSM1355007 | Mock 6 hours biorep C | 84 | GSM1355059 | P. syringae pv tomato DC3000 hrpA- mutant 6 hours biorep C | 136 | GSM1355111 | P. syringae pv tomato DC3000 wildtype 6 hours biorep C |
| 33 | GSM1355008 | Mock 7 hours biorep C | 85 | GSM1355060 | P. syringae pv tomato DC3000 hrpA- mutant 7 hours biorep C | 137 | GSM1355112 | P. syringae pv tomato DC3000 wildtype 7 hours biorep C |
| 34 | GSM1355009 | Mock 8 hours biorep C | 86 | GSM1355061 | P. syringae pv tomato DC3000 hrpA- mutant 8 hours biorep C | 138 | GSM1355113 | P. syringae pv tomato DC3000 wildtype 8 hours biorep C |
| 35 | GSM1355010 | Mock 9 hours biorep C | 87 | GSM1355062 | P. syringae pv tomato DC3000 hrpA- mutant 9 hours biorep C | 139 | GSM1355114 | P. syringae pv tomato DC3000 wildtype 9 hours biorep C |
| 36 | GSM1355011 | Mock 10 hours biorep C | 88 | GSM1355063 | P. syringae pv tomato DC3000 hrpA- mutant 10 hours biorep C | 140 | GSM1355115 | P. syringae pv tomato DC3000 wildtype 10 hours biorep C |
| 37 | GSM1355012 | Mock 11 hours biorep C | 89 | GSM1355064 | P. syringae pv tomato DC3000 hrpA- mutant 11 hours biorep C | 141 | GSM1355116 | P. syringae pv tomato DC3000 wildtype 11 hours biorep C |
| 38 | GSM1355013 | Mock 12 hours biorep C | 90 | GSM1355065 | P. syringae pv tomato DC3000 hrpA- mutant 12 hours biorep C | 142 | GSM1355117 | P. syringae pv tomato DC3000 wildtype 12 hours biorep C |
| 39 | GSM1355014 | Mock 13 hours biorep C | 91 | GSM1355066 | P. syringae pv tomato DC3000 hrpA- mutant 13 hours biorep C | 143 | GSM1355118 | P. syringae pv tomato DC3000 wildtype 13 hours biorep C |
| 40 | GSM1355015 | Mock 1 hours biorep D | 92 | GSM1355067 | P. syringae pv tomato DC3000 hrpA- mutant 1 hours biorep D | 144 | GSM1355119 | P. syringae pv tomato DC3000 wildtype 1 hours biorep D |
| 41 | GSM1355016 | Mock 2 hours biorep D | 93 | GSM1355068 | P. syringae pv tomato DC3000 hrpA- mutant 2 hours biorep D | 145 | GSM1355120 | P. syringae pv tomato DC3000 wildtype 2 hours biorep D |
| 42 | GSM1355017 | Mock 3 hours biorep D | 94 | GSM1355069 | P. syringae pv tomato DC3000 hrpA- mutant 3 hours biorep D | 146 | GSM1355121 | P. syringae pv tomato DC3000 wildtype 3 hours biorep D |
| 43 | GSM1355018 | Mock 4 hours biorep D | 95 | GSM1355070 | P. syringae pv tomato DC3000 hrpA- mutant 4 hours biorep D | 147 | GSM1355122 | P. syringae pv tomato DC3000 wildtype 4 hours biorep D |
| 44 | GSM1355019 | Mock 5 hours biorep D | 96 | GSM1355071 | P. syringae pv tomato DC3000 hrpA- mutant 5 hours biorep D | 148 | GSM1355123 | P. syringae pv tomato DC3000 wildtype 5 hours biorep D |
| 45 | GSM1355020 | Mock 6 hours biorep D | 97 | GSM1355072 | P. syringae pv tomato DC3000 hrpA- mutant 6 hours biorep D | 149 | GSM1355124 | P. syringae pv tomato DC3000 wildtype 6 hours biorep D |
| 46 | GSM1355021 | Mock 7 hours biorep D | 98 | GSM1355073 | P. syringae pv tomato DC3000 hrpA- mutant 7 hours biorep D | 150 | GSM1355125 | P. syringae pv tomato DC3000 wildtype 7 hours biorep D |
| 47 | GSM1355022 | Mock 8 hours biorep D | 99 | GSM1355074 | P. syringae pv tomato DC3000 hrpA- mutant 8 hours biorep D | 151 | GSM1355126 | P. syringae pv tomato DC3000 wildtype 8 hours biorep D |
| 48 | GSM1355023 | Mock 9 hours biorep D | 100 | GSM1355075 | P. syringae pv tomato DC3000 hrpA- mutant 9 hours biorep D | 152 | GSM1355127 | P. syringae pv tomato DC3000 wildtype 9 hours biorep D |
| 49 | GSM1355024 | Mock 10 hours biorep D | 101 | GSM1355076 | P. syringae pv tomato DC3000 hrpA- mutant 10 hours biorep D | 153 | GSM1355128 | P. syringae pv tomato DC3000 wildtype 10 hours biorep D |
| 50 | GSM1355025 | Mock 11 hours biorep D | 102 | GSM1355077 | P. syringae pv tomato DC3000 hrpA- mutant 11 hours biorep D | 154 | GSM1355129 | P. syringae pv tomato DC3000 wildtype 11 hours biorep D |
| 51 | GSM1355026 | Mock 12 hours biorep D | 103 | GSM1355078 | P. syringae pv tomato DC3000 hrpA- mutant 12 hours biorep D | 155 | GSM1355130 | P. syringae pv tomato DC3000 wildtype 12 hours biorep D |
| 52 | GSM1355027 | Mock 13 hours biorep D | 104 | GSM1355079 | P. syringae pv tomato DC3000 hrpA- mutant 13 hours biorep D | 156 | GSM1355131 | P. syringae pv tomato DC3000 wildtype 13 hours biorep D |
# gse$title
# select only mock and DC3000 wt
myE = myE[,c(1:(13*4),((13*4*2+1):(13*4*3)))]
dim(myE)
## [1] 30336 104
nold = dim(myE)[2]
# find match per CATMA
length(rownames(catma)) == length(unique(rownames(catma)))
## [1] TRUE
ind = match(rownames(myE),rownames(catma))
myE = cbind(myE,catma[ind,])
dim(myE)
## [1] 30336 111
# backup copy of expression file merged with pheno data
myEextended = myE
# find duplicated probes (one probe per gene)
# gives you a data frame with a list of ids and the number of times they occurred
n_occur <- data.frame(table(myE[,(nold+3)]))
# tells you which ids occurred more than once
duplicated = n_occur[n_occur$Freq > 1,]
# dim(duplicated)
sum(duplicated[,2])
## [1] 7000
hist(duplicated[,2])
# returns the records with more than one occurrence
duplicatedID = myE[myE[,(nold+3)] %in% n_occur$Var1[n_occur$Freq > 1],]
dim(duplicatedID)
## [1] 7000 111
length(unique(duplicatedID[,(nold+3)])) == dim(duplicated)[1]
## [1] TRUE
myE = myE[,c(1:nold, nold+1, nold+3)]
# remove probes that are not annotated with Ath gene
myE = myE[!is.na(myE[,dim(myE)[2]]),]
any(is.na(myE[,dim(myE)[2]]))
## [1] FALSE
dim(myE)
## [1] 27608 106
# average probe expression for ones that are matching the same gene
X = aggregate(myE[,1:nold], by=list(myE$TAIR9_best), FUN=mean)
dim(X)[1] == length(unique(myE$TAIR9_best))
## [1] TRUE
rownames(X) = X$Group.1
newCatma = rownames(X)
length(newCatma) == length(unique(newCatma))
## [1] TRUE
any(colnames(X)[2:dim(X)[2]] != colnames(myE)[1:nold])
## [1] FALSE
myE = X[,2:dim(X)[2]]
Phenodata
dim((pData(phenoData(gse))))
## [1] 156 38
colnames((pData(phenoData(gse))))
## [1] "title" "geo_accession"
## [3] "status" "submission_date"
## [5] "last_update_date" "type"
## [7] "channel_count" "source_name_ch1"
## [9] "organism_ch1" "characteristics_ch1"
## [11] "characteristics_ch1.1" "characteristics_ch1.2"
## [13] "characteristics_ch1.3" "growth_protocol_ch1"
## [15] "molecule_ch1" "extract_protocol_ch1"
## [17] "label_ch1" "label_protocol_ch1"
## [19] "taxid_ch1" "hyb_protocol"
## [21] "scan_protocol" "scan_protocol.1"
## [23] "description" "data_processing"
## [25] "platform_id" "contact_name"
## [27] "contact_email" "contact_department"
## [29] "contact_institute" "contact_address"
## [31] "contact_city" "contact_zip/postal_code"
## [33] "contact_country" "supplementary_file"
## [35] "supplementary_file.1" "supplementary_file.2"
## [37] "supplementary_file.3" "data_row_count"
show(pData(phenoData(gse))[1:5,c(1,6,8)])
## title type source_name_ch1
## GSM1354976 Mock 1 hours biorep A RNA Mock 1 hours biorep A
## GSM1354977 Mock 2 hours biorep A RNA Mock 2 hours biorep A
## GSM1354978 Mock 3 hours biorep A RNA Mock 3 hours biorep A
## GSM1354979 Mock 4 hours biorep A RNA Mock 4 hours biorep A
## GSM1354980 Mock 5 hours biorep A RNA Mock 5 hours biorep A
myP = (pData(phenoData(gse)))[,c(1,6,8)]
dim(myP)
## [1] 156 3
head(myP)
## title type source_name_ch1
## GSM1354976 Mock 1 hours biorep A RNA Mock 1 hours biorep A
## GSM1354977 Mock 2 hours biorep A RNA Mock 2 hours biorep A
## GSM1354978 Mock 3 hours biorep A RNA Mock 3 hours biorep A
## GSM1354979 Mock 4 hours biorep A RNA Mock 4 hours biorep A
## GSM1354980 Mock 5 hours biorep A RNA Mock 5 hours biorep A
## GSM1354981 Mock 6 hours biorep A RNA Mock 6 hours biorep A
myP = myP[c(1:(13*4),((13*4*2+1):(13*4*3))),]
str(myP$source_name_ch1)
## Factor w/ 156 levels "Mock 1 hours biorep A",..: 1 21 25 29 33 37 41 45 49 5 ...
## - attr(*, "names")= chr [1:104] "V2" "V3" "V4" "V5" ...
all(rownames(myP)==colnames(myE[,1:nold]))
## [1] TRUE
# phenoData <- new("AnnotatedDataFrame",data=myP)
summary(myP)
## title type source_name_ch1
## Mock 1 hours biorep A : 1 RNA:104 Mock 1 hours biorep A : 1
## Mock 1 hours biorep B : 1 Mock 1 hours biorep B : 1
## Mock 1 hours biorep C : 1 Mock 1 hours biorep C : 1
## Mock 1 hours biorep D : 1 Mock 1 hours biorep D : 1
## Mock 10 hours biorep A: 1 Mock 10 hours biorep A: 1
## Mock 10 hours biorep B: 1 Mock 10 hours biorep B: 1
## (Other) :98 (Other) :98
names(myP)
## [1] "title" "type" "source_name_ch1"
sapply(myP, class)
## title type source_name_ch1
## "factor" "factor" "factor"
Design (or model) matrix
(lev <- unique(ldply(strsplit(gsub(" ", "_", as.character(myP$source_name_ch1),
fixed = TRUE),
split = "_hours_biorep"))[[1]]))
## [1] "Mock_1"
## [2] "Mock_2"
## [3] "Mock_3"
## [4] "Mock_4"
## [5] "Mock_5"
## [6] "Mock_6"
## [7] "Mock_7"
## [8] "Mock_8"
## [9] "Mock_9"
## [10] "Mock_10"
## [11] "Mock_11"
## [12] "Mock_12"
## [13] "Mock_13"
## [14] "P._syringae_pv_tomato_DC3000_wildtype_1"
## [15] "P._syringae_pv_tomato_DC3000_wildtype_2"
## [16] "P._syringae_pv_tomato_DC3000_wildtype_3"
## [17] "P._syringae_pv_tomato_DC3000_wildtype_4"
## [18] "P._syringae_pv_tomato_DC3000_wildtype_5"
## [19] "P._syringae_pv_tomato_DC3000_wildtype_6"
## [20] "P._syringae_pv_tomato_DC3000_wildtype_7"
## [21] "P._syringae_pv_tomato_DC3000_wildtype_8"
## [22] "P._syringae_pv_tomato_DC3000_wildtype_9"
## [23] "P._syringae_pv_tomato_DC3000_wildtype_10"
## [24] "P._syringae_pv_tomato_DC3000_wildtype_11"
## [25] "P._syringae_pv_tomato_DC3000_wildtype_12"
## [26] "P._syringae_pv_tomato_DC3000_wildtype_13"
myP$source_name_ch1 = (ldply(strsplit(gsub(" ", "_", as.character(myP$source_name_ch1),
fixed = TRUE),
split = "_hours_biorep"))[[1]])
(f <- factor(myP$source_name_ch1, levels=lev))
## [1] Mock_1
## [2] Mock_2
## [3] Mock_3
## [4] Mock_4
## [5] Mock_5
## [6] Mock_6
## [7] Mock_7
## [8] Mock_8
## [9] Mock_9
## [10] Mock_10
## [11] Mock_11
## [12] Mock_12
## [13] Mock_13
## [14] Mock_1
## [15] Mock_2
## [16] Mock_3
## [17] Mock_4
## [18] Mock_5
## [19] Mock_6
## [20] Mock_7
## [21] Mock_8
## [22] Mock_9
## [23] Mock_10
## [24] Mock_11
## [25] Mock_12
## [26] Mock_13
## [27] Mock_1
## [28] Mock_2
## [29] Mock_3
## [30] Mock_4
## [31] Mock_5
## [32] Mock_6
## [33] Mock_7
## [34] Mock_8
## [35] Mock_9
## [36] Mock_10
## [37] Mock_11
## [38] Mock_12
## [39] Mock_13
## [40] Mock_1
## [41] Mock_2
## [42] Mock_3
## [43] Mock_4
## [44] Mock_5
## [45] Mock_6
## [46] Mock_7
## [47] Mock_8
## [48] Mock_9
## [49] Mock_10
## [50] Mock_11
## [51] Mock_12
## [52] Mock_13
## [53] P._syringae_pv_tomato_DC3000_wildtype_1
## [54] P._syringae_pv_tomato_DC3000_wildtype_2
## [55] P._syringae_pv_tomato_DC3000_wildtype_3
## [56] P._syringae_pv_tomato_DC3000_wildtype_4
## [57] P._syringae_pv_tomato_DC3000_wildtype_5
## [58] P._syringae_pv_tomato_DC3000_wildtype_6
## [59] P._syringae_pv_tomato_DC3000_wildtype_7
## [60] P._syringae_pv_tomato_DC3000_wildtype_8
## [61] P._syringae_pv_tomato_DC3000_wildtype_9
## [62] P._syringae_pv_tomato_DC3000_wildtype_10
## [63] P._syringae_pv_tomato_DC3000_wildtype_11
## [64] P._syringae_pv_tomato_DC3000_wildtype_12
## [65] P._syringae_pv_tomato_DC3000_wildtype_13
## [66] P._syringae_pv_tomato_DC3000_wildtype_1
## [67] P._syringae_pv_tomato_DC3000_wildtype_2
## [68] P._syringae_pv_tomato_DC3000_wildtype_3
## [69] P._syringae_pv_tomato_DC3000_wildtype_4
## [70] P._syringae_pv_tomato_DC3000_wildtype_5
## [71] P._syringae_pv_tomato_DC3000_wildtype_6
## [72] P._syringae_pv_tomato_DC3000_wildtype_7
## [73] P._syringae_pv_tomato_DC3000_wildtype_8
## [74] P._syringae_pv_tomato_DC3000_wildtype_9
## [75] P._syringae_pv_tomato_DC3000_wildtype_10
## [76] P._syringae_pv_tomato_DC3000_wildtype_11
## [77] P._syringae_pv_tomato_DC3000_wildtype_12
## [78] P._syringae_pv_tomato_DC3000_wildtype_13
## [79] P._syringae_pv_tomato_DC3000_wildtype_1
## [80] P._syringae_pv_tomato_DC3000_wildtype_2
## [81] P._syringae_pv_tomato_DC3000_wildtype_3
## [82] P._syringae_pv_tomato_DC3000_wildtype_4
## [83] P._syringae_pv_tomato_DC3000_wildtype_5
## [84] P._syringae_pv_tomato_DC3000_wildtype_6
## [85] P._syringae_pv_tomato_DC3000_wildtype_7
## [86] P._syringae_pv_tomato_DC3000_wildtype_8
## [87] P._syringae_pv_tomato_DC3000_wildtype_9
## [88] P._syringae_pv_tomato_DC3000_wildtype_10
## [89] P._syringae_pv_tomato_DC3000_wildtype_11
## [90] P._syringae_pv_tomato_DC3000_wildtype_12
## [91] P._syringae_pv_tomato_DC3000_wildtype_13
## [92] P._syringae_pv_tomato_DC3000_wildtype_1
## [93] P._syringae_pv_tomato_DC3000_wildtype_2
## [94] P._syringae_pv_tomato_DC3000_wildtype_3
## [95] P._syringae_pv_tomato_DC3000_wildtype_4
## [96] P._syringae_pv_tomato_DC3000_wildtype_5
## [97] P._syringae_pv_tomato_DC3000_wildtype_6
## [98] P._syringae_pv_tomato_DC3000_wildtype_7
## [99] P._syringae_pv_tomato_DC3000_wildtype_8
## [100] P._syringae_pv_tomato_DC3000_wildtype_9
## [101] P._syringae_pv_tomato_DC3000_wildtype_10
## [102] P._syringae_pv_tomato_DC3000_wildtype_11
## [103] P._syringae_pv_tomato_DC3000_wildtype_12
## [104] P._syringae_pv_tomato_DC3000_wildtype_13
## 26 Levels: Mock_1 Mock_2 Mock_3 Mock_4 Mock_5 Mock_6 Mock_7 ... P._syringae_pv_tomato_DC3000_wildtype_13
design <- model.matrix(~0+f)
dim(design)
## [1] 104 26
which(design!=0,arr.ind = T)
## row col
## 1 1 1
## 14 14 1
## 27 27 1
## 40 40 1
## 2 2 2
## 15 15 2
## 28 28 2
## 41 41 2
## 3 3 3
## 16 16 3
## 29 29 3
## 42 42 3
## 4 4 4
## 17 17 4
## 30 30 4
## 43 43 4
## 5 5 5
## 18 18 5
## 31 31 5
## 44 44 5
## 6 6 6
## 19 19 6
## 32 32 6
## 45 45 6
## 7 7 7
## 20 20 7
## 33 33 7
## 46 46 7
## 8 8 8
## 21 21 8
## 34 34 8
## 47 47 8
## 9 9 9
## 22 22 9
## 35 35 9
## 48 48 9
## 10 10 10
## 23 23 10
## 36 36 10
## 49 49 10
## 11 11 11
## 24 24 11
## 37 37 11
## 50 50 11
## 12 12 12
## 25 25 12
## 38 38 12
## 51 51 12
## 13 13 13
## 26 26 13
## 39 39 13
## 52 52 13
## 53 53 14
## 66 66 14
## 79 79 14
## 92 92 14
## 54 54 15
## 67 67 15
## 80 80 15
## 93 93 15
## 55 55 16
## 68 68 16
## 81 81 16
## 94 94 16
## 56 56 17
## 69 69 17
## 82 82 17
## 95 95 17
## 57 57 18
## 70 70 18
## 83 83 18
## 96 96 18
## 58 58 19
## 71 71 19
## 84 84 19
## 97 97 19
## 59 59 20
## 72 72 20
## 85 85 20
## 98 98 20
## 60 60 21
## 73 73 21
## 86 86 21
## 99 99 21
## 61 61 22
## 74 74 22
## 87 87 22
## 100 100 22
## 62 62 23
## 75 75 23
## 88 88 23
## 101 101 23
## 63 63 24
## 76 76 24
## 89 89 24
## 102 102 24
## 64 64 25
## 77 77 25
## 90 90 25
## 103 103 25
## 65 65 26
## 78 78 26
## 91 91 26
## 104 104 26
colnames(design) <- lev
rownames(design) = rownames(myP)
str(design)
## num [1:104, 1:26] 1 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:104] "GSM1354976" "GSM1354977" "GSM1354978" "GSM1354979" ...
## ..$ : chr [1:26] "Mock_1" "Mock_2" "Mock_3" "Mock_4" ...
## - attr(*, "assign")= int [1:26] 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "contrasts")=List of 1
## ..$ f: chr "contr.treatment"
Fit linear model for each gene given a series of arrays
fit <- lmFit(myE, design, weights=NULL)
summary(fit)
## Length Class Mode
## coefficients 623324 -none- numeric
## rank 1 -none- numeric
## assign 26 -none- numeric
## qr 5 qr list
## df.residual 23974 -none- numeric
## sigma 23974 -none- numeric
## cov.coefficients 676 -none- numeric
## stdev.unscaled 623324 -none- numeric
## pivot 26 -none- numeric
## Amean 23974 -none- numeric
## method 1 -none- character
## design 2704 -none- numeric
colnames(fit$design)
## [1] "Mock_1"
## [2] "Mock_2"
## [3] "Mock_3"
## [4] "Mock_4"
## [5] "Mock_5"
## [6] "Mock_6"
## [7] "Mock_7"
## [8] "Mock_8"
## [9] "Mock_9"
## [10] "Mock_10"
## [11] "Mock_11"
## [12] "Mock_12"
## [13] "Mock_13"
## [14] "P._syringae_pv_tomato_DC3000_wildtype_1"
## [15] "P._syringae_pv_tomato_DC3000_wildtype_2"
## [16] "P._syringae_pv_tomato_DC3000_wildtype_3"
## [17] "P._syringae_pv_tomato_DC3000_wildtype_4"
## [18] "P._syringae_pv_tomato_DC3000_wildtype_5"
## [19] "P._syringae_pv_tomato_DC3000_wildtype_6"
## [20] "P._syringae_pv_tomato_DC3000_wildtype_7"
## [21] "P._syringae_pv_tomato_DC3000_wildtype_8"
## [22] "P._syringae_pv_tomato_DC3000_wildtype_9"
## [23] "P._syringae_pv_tomato_DC3000_wildtype_10"
## [24] "P._syringae_pv_tomato_DC3000_wildtype_11"
## [25] "P._syringae_pv_tomato_DC3000_wildtype_12"
## [26] "P._syringae_pv_tomato_DC3000_wildtype_13"
#colnames(fit65046.comp$contrasts)
#which(fit65046.comp$contrasts!=0,arr.ind = T)
Plot few pairs
pairs(myE[1:1000,c(1,14,27,40)]-myE[1:1000,c(1,14,27,40)+52])
Construct the contrast matrix corresponding to specified contrasts of a set of parameters.
# print(eval(parse(text = paste0('gse$',i,'[[1]]'))))
# http://www.bioconductor.org/packages/release/bioc/vignettes/limma/inst/doc/usersguide.pdf
contrast.matrix = makeContrasts(
eval(parse(text = colnames(fit$design)[length(colnames(fit$design))/2 + 1])) -
eval(parse(text = colnames(fit$design)[1])),
eval(parse(text = colnames(fit$design)[length(colnames(fit$design))/2 + 2])) -
eval(parse(text = colnames(fit$design)[2])),
eval(parse(text = colnames(fit$design)[length(colnames(fit$design))/2 + 3])) -
eval(parse(text = colnames(fit$design)[3])),
eval(parse(text = colnames(fit$design)[length(colnames(fit$design))/2 + 4])) -
eval(parse(text = colnames(fit$design)[4])),
eval(parse(text = colnames(fit$design)[length(colnames(fit$design))/2 + 5])) -
eval(parse(text = colnames(fit$design)[5])),
eval(parse(text = colnames(fit$design)[length(colnames(fit$design))/2 + 6])) -
eval(parse(text = colnames(fit$design)[6])),
eval(parse(text = colnames(fit$design)[length(colnames(fit$design))/2 + 7])) -
eval(parse(text = colnames(fit$design)[7])),
eval(parse(text = colnames(fit$design)[length(colnames(fit$design))/2 + 8])) -
eval(parse(text = colnames(fit$design)[8])),
eval(parse(text = colnames(fit$design)[length(colnames(fit$design))/2 + 9])) -
eval(parse(text = colnames(fit$design)[9])),
eval(parse(text = colnames(fit$design)[length(colnames(fit$design))/2 + 10])) -
eval(parse(text = colnames(fit$design)[10])),
eval(parse(text = colnames(fit$design)[length(colnames(fit$design))/2 + 11])) -
eval(parse(text = colnames(fit$design)[11])),
eval(parse(text = colnames(fit$design)[length(colnames(fit$design))/2 + 12])) -
eval(parse(text = colnames(fit$design)[12])),
eval(parse(text = colnames(fit$design)[length(colnames(fit$design))/2 + 13])) -
eval(parse(text = colnames(fit$design)[13])),
levels=design)
# same as
# contrast.matrix.b = makeContrasts(P._syringae_pv_tomato_DC3000_wildtype_1 - Mock_1,
# P._syringae_pv_tomato_DC3000_wildtype_2 - Mock_2,
# P._syringae_pv_tomato_DC3000_wildtype_3 - Mock_3,
# P._syringae_pv_tomato_DC3000_wildtype_4 - Mock_4,
# P._syringae_pv_tomato_DC3000_wildtype_5 - Mock_5,
# P._syringae_pv_tomato_DC3000_wildtype_6 - Mock_6,
# P._syringae_pv_tomato_DC3000_wildtype_7 - Mock_7,
# P._syringae_pv_tomato_DC3000_wildtype_8 - Mock_8,
# P._syringae_pv_tomato_DC3000_wildtype_9 - Mock_9,
# P._syringae_pv_tomato_DC3000_wildtype_10 - Mock_10,
# P._syringae_pv_tomato_DC3000_wildtype_11 - Mock_11,
# P._syringae_pv_tomato_DC3000_wildtype_12 - Mock_12,
# P._syringae_pv_tomato_DC3000_wildtype_13 - Mock_13,
# levels=design)
# any(contrast.matrix.b != contrast.matrix)
Compute Contrasts from Linear Model Fit
fit.comp <- contrasts.fit(fit, contrast.matrix)
Empirical Bayes Statistics for Differential Expression
fit.comp <- eBayes(fit.comp)
Examine unadjusted p-values
plot(density(fit.comp$p.value))
Plot Venn diagrams per tp
# par(mfrow=c(2,2))
c_1_5 <- decideTests(fit.comp[, 1:5], adjust='BH', p.value=0.05)
vennDiagram(c_1_5, include=c("up", "down"), counts.col=c("red", "blue"),
cex=c(0.8, 0.8, 0.8), circle.col=rainbow(5), show.include = TRUE,
main = "Differences 1-5",
names = c ("t1-t1", "t2-t2", "t3-t3", "t4-t4", "t5-t5"))
c_6_10 <- decideTests(fit.comp[, 6:10], adjust='BH', p.value=0.05)
vennDiagram(c_6_10, include=c("up", "down"), counts.col=c("red", "blue"),
cex=c(0.8, 0.8, 0.8), circle.col=rainbow(5), show.include = TRUE,
main = "Differences 6-10",
names = c ("t6-t6", "t7-t7", "t8-t8", "t9-t9", "t10-t10"))
c_11_13 <- decideTests(fit.comp[, 11:13], adjust='BH', p.value=0.05)
vennDiagram(c_11_13, include=c("up", "down"), counts.col=c("red", "blue"),
cex=c(0.8, 0.8, 0.8), circle.col=rainbow(5), show.include = TRUE,
main = "Differences 11-13",
names = c ("t11-t11", "t12-t12", "t13-t13"))
# par(mfrow=c(1,1))
Up/down-regulated genes in infected leaves
temp = decideTests(fit.comp, adjust='fdr', p.value=0.05)
colnames(temp) = paste0('tp_',seq(1,13,1))
summary(temp)
## tp_1 tp_2 tp_3 tp_4 tp_5 tp_6 tp_7 tp_8 tp_9 tp_10 tp_11 tp_12
## -1 2 1211 588 2573 1982 3163 2950 4273 4348 5251 4883 4770
## 0 23972 21297 22461 18656 19997 17743 18217 15433 15002 13029 14043 14218
## 1 0 1466 925 2745 1995 3068 2807 4268 4624 5694 5048 4986
## tp_13
## -1 5432
## 0 13068
## 1 5474
Create output directory
mainDir <- getwd()
subDir <- "outputDirectory_GSE56094"
if (file.exists(subDir)){
setwd(file.path(mainDir, subDir))
} else {
dir.create(file.path(mainDir, subDir))
setwd(file.path(mainDir, subDir))
}
Write results
tmp = getwd()
i = 1
comparisons <- topTable(fit.comp, coef=i, adjust='fdr', sort='p', n=Inf)
colnames(comparisons) = paste0(colnames(comparisons), '_tp_00', i)
head(comparisons)
## logFC_tp_001 AveExpr_tp_001 t_tp_001 P.Value_tp_001
## AT5G62280 -1.1174457 9.307739 -6.396803 9.079465e-09
## AT4G32280 -0.8364199 8.454607 -5.069194 2.412759e-06
## AT5G66580 -0.7290080 9.549609 -4.759518 8.182172e-06
## AT4G22470 1.8837634 14.546944 4.388996 3.350426e-05
## AT5G18060 -0.6052422 8.600483 -4.292388 4.790469e-05
## AT4G34250 -0.6732834 8.489078 -4.245183 5.695893e-05
## adj.P.Val_tp_001 B_tp_001
## AT5G62280 0.0002176711 1.1211599
## AT4G32280 0.0289217474 -0.6615033
## AT5G66580 0.0653864669 -1.0691053
## AT4G22470 0.2008077617 -1.5466730
## AT5G18060 0.2275889119 -1.6689062
## AT4G34250 0.2275889119 -1.7282407
write.table(cbind(rownames(comparisons), comparisons),
sep="\t", quote=F, row.names = FALSE,
file=paste0(tmp, "/DE_GSE56094_tp_", i, ".tsv"))
for (i in 2:dim(fit.comp$contrasts)[2]){
comparison <- topTable(fit.comp, coef=i, adjust='fdr', sort='p', n=Inf)
colnames(comparison) = paste0(colnames(comparison), '_tp_00', i)
write.table(cbind(rownames(comparison), comparison),
sep="\t", quote=F, row.names = FALSE,
file=paste0(tmp, "/DE_GSE56094_tp_", i, ".tsv"))
comparisons = cbind(comparisons, comparison)
}
write.table(cbind(rownames(comparisons), comparisons),
sep="\t", quote=F, row.names = FALSE,
file=paste0(tmp, "/DE_GSE56094_comparisons.tsv"))
setwd(mainDir)
save.image(file="GSE56094.RData")
Article results: http://www.plantcell.org/content/suppl/2015/10/23/tpc.15.00471.DC1/TPC2015-00471-LSBR1_Supplemental_Dataset_1.xlsx
Supplemental Data. Lewis et al. (2015). Plant Cell 10.1105/tpc15.00471.
Supplemental Data Set 1a. LIMMA 3D trend data: inoculation with virulent DC3000 versus MgCl (mock control).
Up Regulated
| Time points: | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Total ids: | 0 | 1712 | 991 | 3109 | 2309 | 3592 | 3210 | 5053 | 5405 | 6830 | 6007 | 5940 | 6539 |
Down Regulated
| Time points: | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Total ids: | 3 | 1331 | 628 | 2897 | 2198 | 3631 | 3391 | 4965 | 5263 | 6175 | 5712 | 5588 | 6379 |