ExpressionSet
objects and SummarizedExperiment
objects are data structures that store these 3 tables all in one place. This ensures consistency.exprs()
to extract the gene expression matrix. The rows in the matrix represent genes and the columns are the samplespData()
returns phenotype data i.e. the sample description table . You can extract each column in the sample description table using the usual $
that you would use to subset a data framefData()
stands for feature data and returns the annotation of the genes in the gene expression matrix?ExpressionSet
if you want to create your own ExpressionSet objectlibrary(Biobase)
## Loading required package: BiocGenerics
## Loading required package: methods
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, cbind, colnames,
## do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff,
## sort, table, tapply, union, unique, unsplit, which, which.max,
## which.min
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
library(ALL)
library(hgu95av2.db) # probe annotation for the Affymatrix chip
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## colMeans, colSums, expand.grid, rowMeans, rowSums
## Loading required package: org.Hs.eg.db
##
##
# or install the packages if you already haven't
# source("http://www.bioconductor.org/biocLite.R")
# biocLite(c("Biobase", "ALL", "hgu95av2.db"))
data(ALL)
ALL
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 12625 features, 128 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: 01005 01010 ... LAL4 (128 total)
## varLabels: cod diagnosis ... date last seen (21 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 14684422 16243790
## Annotation: hgu95av2
# help documentation for ALL data
# ?ALL
experimentData(ALL)
## Experiment data
## Experimenter name: Chiaretti et al.
## Laboratory: Department of Medical Oncology, Dana-Farber Cancer Institute, Department of Medicine, Brigham and Women's Hospital, Harvard Medical School, Boston, MA 02115, USA.
## Contact information:
## Title: Gene expression profile of adult T-cell acute lymphocytic leukemia identifies distinct subsets of patients with different response to therapy and survival.
## URL:
## PMIDs: 14684422 16243790
##
## Abstract: A 187 word abstract is available. Use 'abstract' method.
# extract/explore the gene expression matrix
exprs(ALL)[1:4, 1:4]
## 01005 01010 03002 04006
## 1000_at 7.597323 7.479445 7.567593 7.384684
## 1001_at 5.046194 4.932537 4.799294 4.922627
## 1002_f_at 3.900466 4.208155 3.886169 4.206798
## 1003_s_at 5.903856 6.169024 5.860459 6.116890
head(sampleNames(ALL))
## [1] "01005" "01010" "03002" "04006" "04007" "04008"
head(featureNames(ALL))
## [1] "1000_at" "1001_at" "1002_f_at" "1003_s_at" "1004_at" "1005_at"
# get the phenotype/sample data table
head(pData(ALL))
## cod diagnosis sex age BT remission CR date.cr t(4;11) t(9;22)
## 01005 1005 5/21/1997 M 53 B2 CR CR 8/6/1997 FALSE TRUE
## 01010 1010 3/29/2000 M 19 B2 CR CR 6/27/2000 FALSE FALSE
## 03002 3002 6/24/1998 F 52 B4 CR CR 8/17/1998 NA NA
## 04006 4006 7/17/1997 M 38 B1 CR CR 9/8/1997 TRUE FALSE
## 04007 4007 7/22/1997 M 57 B2 CR CR 9/17/1997 FALSE FALSE
## 04008 4008 7/30/1997 M 17 B1 CR CR 9/27/1997 FALSE FALSE
## cyto.normal citog mol.biol fusion protein mdr kinet ccr
## 01005 FALSE t(9;22) BCR/ABL p210 NEG dyploid FALSE
## 01010 FALSE simple alt. NEG <NA> POS dyploid FALSE
## 03002 NA <NA> BCR/ABL p190 NEG dyploid FALSE
## 04006 FALSE t(4;11) ALL1/AF4 <NA> NEG dyploid FALSE
## 04007 FALSE del(6q) NEG <NA> NEG dyploid FALSE
## 04008 FALSE complex alt. NEG <NA> NEG hyperd. FALSE
## relapse transplant f.u date last seen
## 01005 FALSE TRUE BMT / DEATH IN CR <NA>
## 01010 TRUE FALSE REL 8/28/2000
## 03002 TRUE FALSE REL 10/15/1999
## 04006 TRUE FALSE REL 1/23/1998
## 04007 TRUE FALSE REL 11/4/1997
## 04008 TRUE FALSE REL 12/15/1997
head(pData(ALL)$sex)
## [1] M M F M M M
## Levels: F M
head(ALL$sex)
## [1] M M F M M M
## Levels: F M
# subsetting
ALL[,1:5]
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 12625 features, 5 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: 01005 01010 ... 04007 (5 total)
## varLabels: cod diagnosis ... date last seen (21 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 14684422 16243790
## Annotation: hgu95av2
ALL[1:10,]
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 10 features, 128 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: 01005 01010 ... LAL4 (128 total)
## varLabels: cod diagnosis ... date last seen (21 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 14684422 16243790
## Annotation: hgu95av2
ALL[, c(3,2,1)]
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 12625 features, 3 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: 03002 01010 01005
## varLabels: cod diagnosis ... date last seen (21 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 14684422 16243790
## Annotation: hgu95av2
ALL$sex[c(1,2,3)]
## [1] M M F
## Levels: F M
# the gene annotation data
head(fData(ALL))
## data frame with 0 columns and 6 rows
# find the Entrez IDs for the probes in the chip
library(hgu95av2.db)
ids <- featureNames(ALL)[1:5]
ids
## [1] "1000_at" "1001_at" "1002_f_at" "1003_s_at" "1004_at"
as.list(hgu95av2ENTREZID[ids])
## $`1000_at`
## [1] "5595"
##
## $`1001_at`
## [1] "7075"
##
## $`1002_f_at`
## [1] "1557"
##
## $`1003_s_at`
## [1] "643"
##
## $`1004_at`
## [1] "643"
assays()
returns the count matrix. Rows are genes, columns are samplescolData()
to get the sample descriptions and rowData()
to retrieve gene annotation$
that you would use to subset a data frameassayNames()
to see the names of count matrices stored in the object.rowRanges()
. This returns a GRangesList
or a GRanges
object?SummarizedExperiment
if you want to create your own SummarizedExperiment objectlibrary(SummarizedExperiment)
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
library(GenomicRanges)
library(airway)
# or install using biocLite()
# source("http://www.bioconductor.org/biocLite.R")
# biocLite(c("SummarizedExperiment", "GenomicRanges", "airway"))
data(airway)
airway
## class: RangedSummarizedExperiment
## dim: 64102 8
## metadata(1): ''
## assays(1): counts
## rownames(64102): ENSG00000000003 ENSG00000000005 ... LRG_98 LRG_99
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
# what assays/count matrices are available
assayNames(airway)
## [1] "counts"
# extract the count data
head(assay(airway, "counts"))
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003 679 448 873 408 1138
## ENSG00000000005 0 0 0 0 0
## ENSG00000000419 467 515 621 365 587
## ENSG00000000457 260 211 263 164 245
## ENSG00000000460 60 55 40 35 78
## ENSG00000000938 0 0 2 0 1
## SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003 1047 770 572
## ENSG00000000005 0 0 0
## ENSG00000000419 799 417 508
## ENSG00000000457 331 233 229
## ENSG00000000460 63 76 60
## ENSG00000000938 0 0 0
head(assay(airway))
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003 679 448 873 408 1138
## ENSG00000000005 0 0 0 0 0
## ENSG00000000419 467 515 621 365 587
## ENSG00000000457 260 211 263 164 245
## ENSG00000000460 60 55 40 35 78
## ENSG00000000938 0 0 2 0 1
## SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003 1047 770 572
## ENSG00000000005 0 0 0
## ENSG00000000419 799 417 508
## ENSG00000000457 331 233 229
## ENSG00000000460 63 76 60
## ENSG00000000938 0 0 0
dim(airway)
## [1] 64102 8
# the sample information table
head(colData(airway))
## DataFrame with 6 rows and 9 columns
## SampleName cell dex albut Run avgLength
## <factor> <factor> <factor> <factor> <factor> <integer>
## SRR1039508 GSM1275862 N61311 untrt untrt SRR1039508 126
## SRR1039509 GSM1275863 N61311 trt untrt SRR1039509 126
## SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512 126
## SRR1039513 GSM1275867 N052611 trt untrt SRR1039513 87
## SRR1039516 GSM1275870 N080611 untrt untrt SRR1039516 120
## SRR1039517 GSM1275871 N080611 trt untrt SRR1039517 126
## Experiment Sample BioSample
## <factor> <factor> <factor>
## SRR1039508 SRX384345 SRS508568 SAMN02422669
## SRR1039509 SRX384346 SRS508567 SAMN02422675
## SRR1039512 SRX384349 SRS508571 SAMN02422678
## SRR1039513 SRX384350 SRS508572 SAMN02422670
## SRR1039516 SRX384353 SRS508575 SAMN02422682
## SRR1039517 SRX384354 SRS508576 SAMN02422673
# use $ to get a specific column
head(airway$cell)
## [1] N61311 N61311 N052611 N052611 N080611 N080611
## Levels: N052611 N061011 N080611 N61311
head(colnames(airway))
## [1] "SRR1039508" "SRR1039509" "SRR1039512" "SRR1039513" "SRR1039516"
## [6] "SRR1039517"
head(rownames(airway))
## [1] "ENSG00000000003" "ENSG00000000005" "ENSG00000000419" "ENSG00000000457"
## [5] "ENSG00000000460" "ENSG00000000938"
# get the genomic coordinates of the features
rowRanges(airway)
## GRangesList object of length 64102:
## $ENSG00000000003
## GRanges object with 17 ranges and 2 metadata columns:
## seqnames ranges strand | exon_id exon_name
## <Rle> <IRanges> <Rle> | <integer> <character>
## [1] X [99883667, 99884983] - | 667145 ENSE00001459322
## [2] X [99885756, 99885863] - | 667146 ENSE00000868868
## [3] X [99887482, 99887565] - | 667147 ENSE00000401072
## [4] X [99887538, 99887565] - | 667148 ENSE00001849132
## [5] X [99888402, 99888536] - | 667149 ENSE00003554016
## ... ... ... ... . ... ...
## [13] X [99890555, 99890743] - | 667156 ENSE00003512331
## [14] X [99891188, 99891686] - | 667158 ENSE00001886883
## [15] X [99891605, 99891803] - | 667159 ENSE00001855382
## [16] X [99891790, 99892101] - | 667160 ENSE00001863395
## [17] X [99894942, 99894988] - | 667161 ENSE00001828996
##
## ...
## <64101 more elements>
## -------
## seqinfo: 722 sequences (1 circular) from an unspecified genome
3 types of annotation packages:
BSgenome
packages that are named in the form of BSgenome.organism.provider.genomeVersion that contain the actual DNA sequence e.g. BSgenome.Hsapiens.UCSC.hg38
TxDb
packages that contain gene models and are named in the form of TxDb.organism.provider.genomeVersion.type e.g. TxDb.Dmelanogaster.UCSC.dm6.ensGene
orgDb
packages that provide different gene annotations for any given species such as org.Mm.eg.db
. These are the most useful packages when one is interested in ID conversion i.e. ENSEMBL to ENTREZ Ids. These packages start with org.In this section we explore the orgDb packages.
org.xx.xx.db
objectcolumns()
to see what are the available columns to select from. Each column is an annotation database here that we can make query onkeys()
to see what keys are available to make yor query on (i.e the values that we can use to make queries on the the available column)select()
to extract the information in a column or multiple columns usings the values (i.e keys) that you have.Lets apply these functions to convert the Ensembl ids in the airway
data to their corresponding EntrezIDs. To do this, we first need to load the org.Hs.eg.db
package and explore the information that it contains.
# biocInstalled::biocLite("org.Hs.eg.db")
library(org.Hs.eg.db)
# values that you can use to make queries on the org.Hs.eg.db object
keys <- keytypes(org.Hs.eg.db)
head(keys, n = 30)
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT"
## [5] "ENSEMBLTRANS" "ENTREZID" "ENZYME" "EVIDENCE"
## [9] "EVIDENCEALL" "GENENAME" "GO" "GOALL"
## [13] "IPI" "MAP" "OMIM" "ONTOLOGY"
## [17] "ONTOLOGYALL" "PATH" "PFAM" "PMID"
## [21] "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG"
## [25] "UNIGENE" "UNIPROT"
# the available annotations
columns(org.Hs.eg.db)
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT"
## [5] "ENSEMBLTRANS" "ENTREZID" "ENZYME" "EVIDENCE"
## [9] "EVIDENCEALL" "GENENAME" "GO" "GOALL"
## [13] "IPI" "MAP" "OMIM" "ONTOLOGY"
## [17] "ONTOLOGYALL" "PATH" "PFAM" "PMID"
## [21] "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG"
## [25] "UNIGENE" "UNIPROT"
Lets convert the Ensembl Ids in the airway data to their Entrez Ids …
head(rownames(airway))
## [1] "ENSG00000000003" "ENSG00000000005" "ENSG00000000419" "ENSG00000000457"
## [5] "ENSG00000000460" "ENSG00000000938"
ensembl_to_entrez <- select(org.Hs.eg.db, keys = as.character(rownames(airway)), columns= "ENTREZID", keytype="ENSEMBL")
## 'select()' returned 1:many mapping between keys and columns
head(ensembl_to_entrez)
## ENSEMBL ENTREZID
## 1 ENSG00000000003 7105
## 2 ENSG00000000005 64102
## 3 ENSG00000000419 8813
## 4 ENSG00000000457 57147
## 5 ENSG00000000460 55732
## 6 ENSG00000000938 2268
find the gene symbols and gene descriptions (names) for the airway data
Find the symbol of the gene with Ensembl ID “ENSG00000000005” and all the aliases associated with that genes. How many transcripts are recorded for this gene in ENSEMBL Transcript database?
GEOquery
package in BioconductorgetGEO()
to retrieve the main files and getGEOSuppFiles()
to retrieve the supplementary data# source("http://www.bioconductor.org/biocLite.R")
# biocLite(c("GEOquery"))
library(GEOquery)
## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)
eList <- getGEO("GSE11675")
## https://ftp.ncbi.nlm.nih.gov/geo/series/GSE11nnn/GSE11675/matrix/
## OK
## Found 1 file(s)
## GSE11675_series_matrix.txt.gz
## File stored at:
## /var/folders/9v/cjl49srx09j1r1xjd5nwry5h0000gn/T//RtmpXCdkgG/GPL8300.soft
## Warning in read.table(file = file, header = header, sep = sep, quote =
## quote, : not all columns named in 'colClasses' exist
class(eList)
## [1] "list"
length(eList)
## [1] 1
names(eList)
## [1] "GSE11675_series_matrix.txt.gz"
eData <- eList[[1]]
eData
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 12625 features, 6 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: GSM296630 GSM296635 ... GSM296639 (6 total)
## varLabels: title geo_accession ... data_row_count (34 total)
## varMetadata: labelDescription
## featureData
## featureNames: 1000_at 1001_at ... AFFX-YEL024w/RIP1_at (12625
## total)
## fvarLabels: ID GB_ACC ... Gene Ontology Molecular Function (16
## total)
## fvarMetadata: Column Description labelDescription
## experimentData: use 'experimentData(object)'
## Annotation: GPL8300
names(pData(eData))
## [1] "title" "geo_accession"
## [3] "status" "submission_date"
## [5] "last_update_date" "type"
## [7] "channel_count" "source_name_ch1"
## [9] "organism_ch1" "characteristics_ch1"
## [11] "characteristics_ch1.1" "characteristics_ch1.2"
## [13] "molecule_ch1" "extract_protocol_ch1"
## [15] "label_ch1" "label_protocol_ch1"
## [17] "taxid_ch1" "hyb_protocol"
## [19] "scan_protocol" "description"
## [21] "data_processing" "platform_id"
## [23] "contact_name" "contact_email"
## [25] "contact_phone" "contact_department"
## [27] "contact_institute" "contact_address"
## [29] "contact_city" "contact_zip/postal_code"
## [31] "contact_country" "supplementary_file"
## [33] "supplementary_file.1" "data_row_count"
eList2 <- getGEOSuppFiles("GSE11675")
## https://ftp.ncbi.nlm.nih.gov/geo/series/GSE11nnn/GSE11675/suppl/
## OK
rownames(eList2) <- basename(rownames(eList2))
eList2
## size isdir mode mtime
## GSE11675_RAW.tar 45803520 FALSE 644 2017-05-18 16:58:09
## filelist.txt 740 FALSE 644 2017-05-18 16:58:11
## ctime atime uid gid uname
## GSE11675_RAW.tar 2017-05-18 16:58:09 2017-05-18 16:57:56 501 20 pfh
## filelist.txt 2017-05-18 16:58:11 2017-05-17 11:05:47 501 20 pfh
## grname
## GSE11675_RAW.tar staff
## filelist.txt staff
This is now a data.frame of file names. A single TAR archive was downloaded. You can expand the TAR achive using standard tools; inside there is a list of 6 CEL files and 6 CHP files. You can then read the 6 CEL files into R using functions from affy or oligo.
It is also possible to use GEOquery to query GEO as a database (ie. looking for datasets); more information in the package vignette.