# This will install the packages used in this file.
source("http://www.bioconductor.org/biocLite.R")
biocLite(c(
"GenomicRanges",
"Biostrings",
"BSgenome",
"BSgenome.Scerevisiae.UCSC.sacCer3",
"TxDb.Scerevisiae.UCSC.sacCer3.sgdGene",
"seqLogo"))
library(GenomicRanges)
library(Biostrings)
library(BSgenome)
Bioconductor packages usually have documentation in the form of “vignettes”. These are also available on the Bioconductor website: https://www.bioconductor.org/help/
vignette()
vignette(package="Biostrings")
vignette("BiostringsQuickOverview")
browseVignettes()
Integer ranges, 1-based, from start to end inclusive.
This is R, where everything is a vector, so there is no singular IRange, only plural IRanges.
myiranges <- IRanges(start=c(5,20,25), end=c(10,30,40))
Like all objects from Bioconductor, myiranges
is an “S4” object. It has a class, which can be obtained with class()
. This determines:
Data stored in slots is accessible with @
(similar to $
for lists), but code that uses accessor functions will be more generic and less liable to break in future.
myiranges
## IRanges object with 3 ranges and 0 metadata columns:
## start end width
## <integer> <integer> <integer>
## [1] 5 10 6
## [2] 20 30 11
## [3] 25 40 16
class(myiranges)
## [1] "IRanges"
## attr(,"package")
## [1] "IRanges"
methods(class="IRanges")
## [1] ! != [
## [4] [[ [[<- [<-
## [7] %in% < <=
## [10] == > >=
## [13] $ $<- aggregate
## [16] anyNA append as.character
## [19] as.complex as.data.frame as.env
## [22] as.integer as.list as.logical
## [25] as.matrix as.numeric as.raw
## [28] as.vector by c
## [31] cbind coerce cor
## [34] countOverlaps cov coverage
## [37] diff disjoin disjointBins
## [40] distance distanceToNearest do.call
## [43] drop droplevels duplicated
## [46] elementMetadata elementMetadata<- elementNROWS
## [49] elementType end end<-
## [52] endoapply eval expand
## [55] expand.grid export extractList
## [58] extractROWS Filter findOverlaps
## [61] flank follow gaps
## [64] getListElement head ifelse
## [67] intersect IQR is.na
## [70] is.unsorted isDisjoint isEmpty
## [73] isNormal lapply length
## [76] lengths mad match
## [79] mcols mcols<- mean
## [82] median mendoapply merge
## [85] metadata metadata<- mid
## [88] mstack names names<-
## [91] narrow nearest NROW
## [94] NSBS Ops order
## [97] overlapsAny parallelSlotNames parallelVectorNames
## [100] pcompare pcompareRecursively pgap
## [103] pintersect pmax pmax.int
## [106] pmin pmin.int poverlaps
## [109] precede promoters psetdiff
## [112] punion quantile range
## [115] rank rbind Reduce
## [118] reduce reflect relist
## [121] rename rep rep.int
## [124] replaceROWS resize restrict
## [127] rev revElements reverse
## [130] ROWNAMES sapply sd
## [133] selfmatch seqinfo seqinfo<-
## [136] seqlevelsInUse setdiff setequal
## [139] shift shiftApply show
## [142] showAsCell slidingWindows sort
## [145] split split<- stack
## [148] start start<- subset
## [151] subsetByOverlaps Summary table
## [154] tail tapply threebands
## [157] tile union unique
## [160] unlist unsplit update
## [163] values values<- var
## [166] which.max which.min whichFirstNotNormal
## [169] width width<- window
## [172] window<- with within
## [175] xtabs xtfrm zipdown
## see '?methods' for accessing help and source code
# ?"IRanges-class"
# Accessor functions
start(myiranges)
## [1] 5 20 25
#(or myiranges@start, but start(myiranges) is preferable)
end(myiranges)
## [1] 10 30 40
width(myiranges)
## [1] 6 11 16
# IRanges supports many operations
resize(myiranges, 3, fix="start")
## IRanges object with 3 ranges and 0 metadata columns:
## start end width
## <integer> <integer> <integer>
## [1] 5 7 3
## [2] 20 22 3
## [3] 25 27 3
resize(myiranges, 3, fix="end")
## IRanges object with 3 ranges and 0 metadata columns:
## start end width
## <integer> <integer> <integer>
## [1] 8 10 3
## [2] 28 30 3
## [3] 38 40 3
Ok, this gets confusing, let me illustrate.
show_iranges <- function(obj) {
for(i in seq_along(obj))
cat(rep(" ", start(obj)[i]),
rep("=", width(obj)[i]),
"\n", sep="")
}
show_iranges( myiranges )
## ======
## ===========
## ================
show_iranges( resize(myiranges, 3, fix="start") )
## ===
## ===
## ===
show_iranges( resize(myiranges, 3, fix="end") )
## ===
## ===
## ===
show_iranges( flank(myiranges, 2, start=TRUE) )
## ==
## ==
## ==
show_iranges( flank(myiranges, 2, start=FALSE) )
## ==
## ==
## ==
show_iranges( range(myiranges) )
## ====================================
show_iranges( reduce(myiranges) )
## ======
## =====================
show_iranges( disjoin(myiranges) )
## ======
## =====
## ======
## ==========
show_iranges( setdiff(range(myiranges), myiranges) )
## =========
show_iranges( intersect(myiranges[2], myiranges[3]) )
## ======
show_iranges( union(myiranges[2], myiranges[3]) )
## =====================
# ?"intra-range-methods"
# ?"inter-range-methods"
# ?"setops-methods"
# ?"findOverlaps-methods"
# ?"nearest-methods"
Note: Loading tidyverse packages may clobber some of these generic functions. If necessary use BiocGenerics::setdiff
, etc.
Without using the promoters()
function, how would you get IRanges from -5 to +2 of the starts of myiranges
?
To refer to a location in a genome we also need:
mygranges <- GRanges(
seqnames = c("chrII", "chrI", "chrI"),
ranges = IRanges(start=c(5,20,25), end=c(10,30,40)),
strand = c("+", "-", "+"))
mygranges
## GRanges object with 3 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chrII [ 5, 10] +
## [2] chrI [20, 30] -
## [3] chrI [25, 40] +
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
class(mygranges)
## [1] "GRanges"
## attr(,"package")
## [1] "GenomicRanges"
methods(class="GRanges")
## [1] != [ [<-
## [4] %in% < <=
## [7] == > >=
## [10] $ $<- aggregate
## [13] anyNA append as.character
## [16] as.complex as.data.frame as.env
## [19] as.factor as.integer as.list
## [22] as.logical as.matrix as.numeric
## [25] as.raw blocks browseGenome
## [28] by c chrom
## [31] chrom<- coerce coerce<-
## [34] countOverlaps coverage disjoin
## [37] disjointBins distance distanceToNearest
## [40] duplicated elementMetadata elementMetadata<-
## [43] end end<- eval
## [46] expand expand.grid export
## [49] extractROWS findOverlaps flank
## [52] follow gaps granges
## [55] head intersect is.unsorted
## [58] isDisjoint length lengths
## [61] liftOver match mcols
## [64] mcols<- merge metadata
## [67] metadata<- mstack names
## [70] names<- narrow nearest
## [73] NROW Ops order
## [76] overlapsAny parallelSlotNames pcompare
## [79] pgap pintersect precede
## [82] promoters psetdiff punion
## [85] range ranges ranges<-
## [88] rank reduce relist
## [91] relistToClass rename rep
## [94] rep.int replaceROWS resize
## [97] restrict rev ROWNAMES
## [100] score score<- selfmatch
## [103] seqinfo seqinfo<- seqlevelsInUse
## [106] seqnames seqnames<- setdiff
## [109] setequal shift shiftApply
## [112] show showAsCell slidingWindows
## [115] sort split split<-
## [118] start start<- strand
## [121] strand<- subset subsetByOverlaps
## [124] summary table tail
## [127] tapply tile trim
## [130] union unique update
## [133] updateObject values values<-
## [136] width width<- window
## [139] window<- with xtabs
## [142] xtfrm
## see '?methods' for accessing help and source code
# ?"GRanges-class"
seqnames(mygranges)
## factor-Rle of length 3 with 2 runs
## Lengths: 1 2
## Values : chrII chrI
## Levels(2): chrII chrI
strand(mygranges)
## factor-Rle of length 3 with 3 runs
## Lengths: 1 1 1
## Values : + - +
## Levels(3): + - *
ranges(mygranges)
## IRanges object with 3 ranges and 0 metadata columns:
## start end width
## <integer> <integer> <integer>
## [1] 5 10 6
## [2] 20 30 11
## [3] 25 40 16
start(mygranges)
## [1] 5 20 25
end(mygranges)
## [1] 10 30 40
as.data.frame(mygranges)
## seqnames start end width strand
## 1 chrII 5 10 6 +
## 2 chrI 20 30 11 -
## 3 chrI 25 40 16 +
# GRanges are like vectors:
mygranges[2]
## GRanges object with 1 range and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chrI [20, 30] -
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
c(mygranges, mygranges)
## GRanges object with 6 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chrII [ 5, 10] +
## [2] chrI [20, 30] -
## [3] chrI [25, 40] +
## [4] chrII [ 5, 10] +
## [5] chrI [20, 30] -
## [6] chrI [25, 40] +
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
# Like other vectors, elements may be named
names(mygranges) <- c("foo", "bar", "baz")
mygranges
## GRanges object with 3 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## foo chrII [ 5, 10] +
## bar chrI [20, 30] -
## baz chrI [25, 40] +
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
# GRanges can have metadata columns, so they are also a little like data frames:
mygranges$wobble <- c(10, 20, 30)
mygranges
## GRanges object with 3 ranges and 1 metadata column:
## seqnames ranges strand | wobble
## <Rle> <IRanges> <Rle> | <numeric>
## foo chrII [ 5, 10] + | 10
## bar chrI [20, 30] - | 20
## baz chrI [25, 40] + | 30
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
mcols(mygranges)
## DataFrame with 3 rows and 1 column
## wobble
## <numeric>
## 1 10
## 2 20
## 3 30
mygranges$wobble
## [1] 10 20 30
# A handy way to create a GRanges
as("chrI:3-5:+", "GRanges")
## GRanges object with 1 range and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chrI [3, 5] +
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
# All the operations we saw for IRanges are available for GRanges.
# However most GRanges operations will take strand into account.
#
# Warning: shift() is a notable exception to this.
resize(mygranges, 3, fix="start")
## GRanges object with 3 ranges and 1 metadata column:
## seqnames ranges strand | wobble
## <Rle> <IRanges> <Rle> | <numeric>
## foo chrII [ 5, 7] + | 10
## bar chrI [28, 30] - | 20
## baz chrI [25, 27] + | 30
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
resize(mygranges, 3, fix="end")
## GRanges object with 3 ranges and 1 metadata column:
## seqnames ranges strand | wobble
## <Rle> <IRanges> <Rle> | <numeric>
## foo chrII [ 8, 10] + | 10
## bar chrI [20, 22] - | 20
## baz chrI [38, 40] + | 30
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
A DNAString is like a character string of bases “A”, “C”, “G”, “T”, or possibly IUPAC ambiguity codes such as “N” for any base.
myseq <- DNAString("gcgctgctggatgcgaccgcgcatgcgagcgcgacctatccggaa")
myseq
## 45-letter "DNAString" instance
## seq: GCGCTGCTGGATGCGACCGCGCATGCGAGCGCGACCTATCCGGAA
class(myseq)
## [1] "DNAString"
## attr(,"package")
## [1] "Biostrings"
methods(class="DNAString")
## [1] != [
## [3] [<- %in%
## [5] < <=
## [7] == >
## [9] >= aggregate
## [11] alphabetFrequency anyNA
## [13] append as.character
## [15] as.complex as.data.frame
## [17] as.env as.integer
## [19] as.list as.logical
## [21] as.matrix as.numeric
## [23] as.raw as.vector
## [25] by c
## [27] chartr codons
## [29] coerce compact
## [31] compareStrings complement
## [33] countOverlaps countPattern
## [35] countPDict countPWM
## [37] duplicated elementMetadata
## [39] elementMetadata<- eval
## [41] expand expand.grid
## [43] extractAt extractList
## [45] extractROWS findOverlaps
## [47] findPalindromes hasOnlyBaseLetters
## [49] head intersect
## [51] isMatchingEndingAt isMatchingStartingAt
## [53] lcprefix lcsubstr
## [55] lcsuffix length
## [57] lengths letter
## [59] letterFrequency letterFrequencyInSlidingView
## [61] maskMotif masks
## [63] masks<- match
## [65] matchLRPatterns matchPattern
## [67] matchPDict matchProbePair
## [69] matchPWM mcols
## [71] mcols<- merge
## [73] metadata metadata<-
## [75] mstack nchar
## [77] neditEndingAt neditStartingAt
## [79] needwunsQS NROW
## [81] oligonucleotideFrequency overlapsAny
## [83] PairwiseAlignments PairwiseAlignmentsSingleSubject
## [85] palindromeArmLength palindromeLeftArm
## [87] palindromeRightArm parallelSlotNames
## [89] pcompare pmatchPattern
## [91] rank relist
## [93] relistToClass rename
## [95] rep rep.int
## [97] replaceAt replaceLetterAt
## [99] replaceROWS rev
## [101] reverse reverseComplement
## [103] ROWNAMES seqlevelsInUse
## [105] seqtype seqtype<-
## [107] setdiff setequal
## [109] shiftApply show
## [111] showAsCell sort
## [113] split split<-
## [115] subseq subseq<-
## [117] subset subsetByOverlaps
## [119] substr substring
## [121] table tail
## [123] tapply toComplex
## [125] toString translate
## [127] trimLRPatterns twoWayAlphabetFrequency
## [129] union unique
## [131] uniqueLetters unmasked
## [133] updateObject values
## [135] values<- vcountPattern
## [137] vcountPDict Views
## [139] vmatchPattern vmatchPDict
## [141] vwhichPDict which.isMatchingEndingAt
## [143] which.isMatchingStartingAt whichPDict
## [145] window window<-
## [147] with xtabs
## [149] xtfrm xvcopy
## see '?methods' for accessing help and source code
# ?"DNAString-class"
reverseComplement(myseq)
## 45-letter "DNAString" instance
## seq: TTCCGGATAGGTCGCGCTCGCATGCGCGGTCGCATCCAGCAGCGC
translate(myseq)
## 15-letter "AAString" instance
## seq: ALLDATAHASATYPE
subseq(myseq, 3,5)
## 3-letter "DNAString" instance
## seq: GCT
alphabetFrequency(myseq)
## A C G T M R W S Y K V H D B N - + .
## 8 15 16 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0
letterFrequency(myseq, c("AT", "GC"))
## A|T G|C
## 14 31
# DNAString objects behave like vectors.
myseq[3:5]
## 3-letter "DNAString" instance
## seq: GCT
c(myseq, myseq)
## 90-letter "DNAString" instance
## seq: GCGCTGCTGGATGCGACCGCGCATGCGAGCGCGA...GCGACCGCGCATGCGAGCGCGACCTATCCGGAA
# as() converts between types.
as(myseq, "character")
## [1] "GCGCTGCTGGATGCGACCGCGCATGCGAGCGCGACCTATCCGGAA"
as("ACGT", "DNAString")
## 4-letter "DNAString" instance
## seq: ACGT
matchPattern("GCG", myseq)
## Views on a 45-letter DNAString subject
## subject: GCGCTGCTGGATGCGACCGCGCATGCGAGCGCGACCTATCCGGAA
## views:
## start end width
## [1] 1 3 3 [GCG]
## [2] 13 15 3 [GCG]
## [3] 19 21 3 [GCG]
## [4] 25 27 3 [GCG]
## [5] 29 31 3 [GCG]
## [6] 31 33 3 [GCG]
countPattern("GCG", myseq)
## [1] 6
These functions have arguments to allow for some number of substitutions, or even substitutions, insertions, and deletions.
Use pairwiseAlignment()
for more flexible local or global alignment.
Use matchPWM()
for a motif represented as a Position Weight Matrix.
We often want to work with a collection of DNA sequences.
myset <- DNAStringSet(list(chrI=myseq, chrII=DNAString("ACGTACGTAA")))
myset
## A DNAStringSet instance of length 2
## width seq names
## [1] 45 GCGCTGCTGGATGCGACCGCG...CGAGCGCGACCTATCCGGAA chrI
## [2] 10 ACGTACGTAA chrII
class(myset)
## [1] "DNAStringSet"
## attr(,"package")
## [1] "Biostrings"
# ?"DNAStringSet-class"
elementNROWS(myset)
## [1] 45 10
seqinfo(myset)
## Seqinfo object with 2 sequences from an unspecified genome:
## seqnames seqlengths isCircular genome
## chrI 45 NA <NA>
## chrII 10 NA <NA>
# Since a DNAString is like a vector, a DNAStringSet is has to be like a list.
myset$chrII
## 10-letter "DNAString" instance
## seq: ACGTACGTAA
# or myset[["chrII"]]
# or myset[[2]]
# Getting sequences with GRanges
getSeq(myset, mygranges)
## A DNAStringSet instance of length 3
## width seq
## [1] 6 ACGTAA
## [2] 11 GCTCGCATGCG
## [3] 16 GCGAGCGCGACCTATC
getSeq(myset, as("chrI:1-3:+", "GRanges"))
## A DNAStringSet instance of length 1
## width seq
## [1] 3 GCG
getSeq(myset, as("chrI:1-3:-", "GRanges"))
## A DNAStringSet instance of length 1
## width seq
## [1] 3 CGC
# Performs reverse complement if strand is "-".
GRanges can also have seqinfo, allowing various forms of error checking.
Reverse complement the following DNA sequence and then translate to an amino acid sequence:
TTCCATTTCCAT
Genomes and genes for many model organisms are available as Bioconductor packages.
Further data may be available using AnnotationHub.
We will be using packages for yeast in this part of the workshop.
library(BSgenome.Scerevisiae.UCSC.sacCer3)
# A yeast genome object has been loaded.
# Actual sequence data is only loaded from disk as needed.
genome <- Scerevisiae
class(genome)
## [1] "BSgenome"
## attr(,"package")
## [1] "BSgenome"
# ?"BSgenome-class"
seqinfo(genome)
## Seqinfo object with 17 sequences (1 circular) from sacCer3 genome:
## seqnames seqlengths isCircular genome
## chrI 230218 FALSE sacCer3
## chrII 813184 FALSE sacCer3
## chrIII 316620 FALSE sacCer3
## chrIV 1531933 FALSE sacCer3
## chrV 576874 FALSE sacCer3
## ... ... ... ...
## chrXIII 924431 FALSE sacCer3
## chrXIV 784333 FALSE sacCer3
## chrXV 1091291 FALSE sacCer3
## chrXVI 948066 FALSE sacCer3
## chrM 85779 TRUE sacCer3
genome$chrM
## 85779-letter "DNAString" instance
## seq: TTCATAATTAATTTTTTATATATATATTATATTA...AGAAATATGCTTAATTATAATATAATATCCATA
library(TxDb.Scerevisiae.UCSC.sacCer3.sgdGene)
## Loading required package: GenomicFeatures
## Loading required package: AnnotationDbi
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
# An object referring to a yeast transcriptome database has been loaded.
# Actual data is only loaded from disk as needed.
txdb <- TxDb.Scerevisiae.UCSC.sacCer3.sgdGene
class(txdb)
## [1] "TxDb"
## attr(,"package")
## [1] "GenomicFeatures"
# ?"TxDb-class"
Genes have transcripts. Transcripts have exons, and CDSs if they are protein coding.
The GRange for a transcript spans all of its exons. The GRange for a gene spans all the exons of all of its transcripts.
For example:
=============================================== gene
======================== transcript1 of gene
======= ==== ======== exons of transcript1
==== ==== === CDSs of transcript1
======================================= transcript2 of gene
====== ====== ========== exons of transcript2
==== ====== === CDSs of transcript2
genes(txdb)
## GRanges object with 6534 ranges and 1 metadata column:
## seqnames ranges strand | gene_id
## <Rle> <IRanges> <Rle> | <character>
## Q0010 chrM [ 3952, 4415] + | Q0010
## Q0032 chrM [11667, 11957] + | Q0032
## Q0055 chrM [13818, 26701] + | Q0055
## Q0075 chrM [24156, 25255] + | Q0075
## Q0080 chrM [27666, 27812] + | Q0080
## ... ... ... ... . ...
## YPR200C chrXVI [939279, 939671] - | YPR200C
## YPR201W chrXVI [939922, 941136] + | YPR201W
## YPR202W chrXVI [943032, 944188] + | YPR202W
## YPR204C-A chrXVI [946856, 947338] - | YPR204C-A
## YPR204W chrXVI [944603, 947701] + | YPR204W
## -------
## seqinfo: 17 sequences (1 circular) from sacCer3 genome
transcriptsBy(txdb, "gene")
## GRangesList object of length 6534:
## $Q0010
## GRanges object with 2 ranges and 2 metadata columns:
## seqnames ranges strand | tx_id tx_name
## <Rle> <IRanges> <Rle> | <integer> <character>
## [1] chrM [3952, 4338] + | 6665 Q0010
## [2] chrM [4254, 4415] + | 6666 Q0017
##
## $Q0032
## GRanges object with 1 range and 2 metadata columns:
## seqnames ranges strand | tx_id tx_name
## [1] chrM [11667, 11957] + | 6667 Q0032
##
## $Q0055
## GRanges object with 6 ranges and 2 metadata columns:
## seqnames ranges strand | tx_id tx_name
## [1] chrM [13818, 16322] + | 6668 Q0050
## [2] chrM [13818, 18830] + | 6669 Q0055
## [3] chrM [13818, 19996] + | 6670 Q0060
## [4] chrM [13818, 21935] + | 6671 Q0065
## [5] chrM [13818, 23167] + | 6672 Q0070
## [6] chrM [13818, 26701] + | 6673 Q0045
##
## ...
## <6531 more elements>
## -------
## seqinfo: 17 sequences (1 circular) from sacCer3 genome
exonsBy(txdb, "tx", use.names=TRUE)
## GRangesList object of length 6692:
## $YAL069W
## GRanges object with 1 range and 3 metadata columns:
## seqnames ranges strand | exon_id exon_name exon_rank
## <Rle> <IRanges> <Rle> | <integer> <character> <integer>
## [1] chrI [335, 649] + | 1 <NA> 1
##
## $YAL068W-A
## GRanges object with 1 range and 3 metadata columns:
## seqnames ranges strand | exon_id exon_name exon_rank
## [1] chrI [538, 792] + | 2 <NA> 1
##
## $YAL067W-A
## GRanges object with 1 range and 3 metadata columns:
## seqnames ranges strand | exon_id exon_name exon_rank
## [1] chrI [2480, 2707] + | 3 <NA> 1
##
## ...
## <6689 more elements>
## -------
## seqinfo: 17 sequences (1 circular) from sacCer3 genome
cdsBy(txdb, "tx", use.names=TRUE)
## GRangesList object of length 6692:
## $YAL069W
## GRanges object with 1 range and 3 metadata columns:
## seqnames ranges strand | cds_id cds_name exon_rank
## <Rle> <IRanges> <Rle> | <integer> <character> <integer>
## [1] chrI [335, 649] + | 1 <NA> 1
##
## $YAL068W-A
## GRanges object with 1 range and 3 metadata columns:
## seqnames ranges strand | cds_id cds_name exon_rank
## [1] chrI [538, 792] + | 2 <NA> 1
##
## $YAL067W-A
## GRanges object with 1 range and 3 metadata columns:
## seqnames ranges strand | cds_id cds_name exon_rank
## [1] chrI [2480, 2707] + | 3 <NA> 1
##
## ...
## <6689 more elements>
## -------
## seqinfo: 17 sequences (1 circular) from sacCer3 genome
To illustrate the use of a TxDb, let us try to extract the start codons of yeast. Our first step is to obtain the locations of the coding sequences.
cds_list <- cdsBy(txdb, "tx", use.names=TRUE)
cds_list
## GRangesList object of length 6692:
## $YAL069W
## GRanges object with 1 range and 3 metadata columns:
## seqnames ranges strand | cds_id cds_name exon_rank
## <Rle> <IRanges> <Rle> | <integer> <character> <integer>
## [1] chrI [335, 649] + | 1 <NA> 1
##
## $YAL068W-A
## GRanges object with 1 range and 3 metadata columns:
## seqnames ranges strand | cds_id cds_name exon_rank
## [1] chrI [538, 792] + | 2 <NA> 1
##
## $YAL067W-A
## GRanges object with 1 range and 3 metadata columns:
## seqnames ranges strand | cds_id cds_name exon_rank
## [1] chrI [2480, 2707] + | 3 <NA> 1
##
## ...
## <6689 more elements>
## -------
## seqinfo: 17 sequences (1 circular) from sacCer3 genome
class(cds_list)
## [1] "GRangesList"
## attr(,"package")
## [1] "GenomicRanges"
head( elementNROWS(cds_list) )
## YAL069W YAL068W-A YAL067W-A YAL066W YAL064W-B YAL064W
## 1 1 1 1 1 1
table( elementNROWS(cds_list) )
##
## 1 2 3 4 5 6 8
## 6363 312 12 2 1 1 1
cds_list[ elementNROWS(cds_list) >= 2 ]
## GRangesList object of length 329:
## $YAL030W
## GRanges object with 2 ranges and 3 metadata columns:
## seqnames ranges strand | cds_id cds_name exon_rank
## <Rle> <IRanges> <Rle> | <integer> <character> <integer>
## [1] chrI [87286, 87387] + | 26 <NA> 1
## [2] chrI [87501, 87752] + | 27 <NA> 2
##
## $YAL003W
## GRanges object with 2 ranges and 3 metadata columns:
## seqnames ranges strand | cds_id cds_name exon_rank
## [1] chrI [142174, 142253] + | 40 <NA> 1
## [2] chrI [142620, 143160] + | 41 <NA> 2
##
## $YAL001C
## GRanges object with 2 ranges and 3 metadata columns:
## seqnames ranges strand | cds_id cds_name exon_rank
## [1] chrI [151097, 151166] - | 107 <NA> 1
## [2] chrI [147594, 151006] - | 106 <NA> 2
##
## ...
## <326 more elements>
## -------
## seqinfo: 17 sequences (1 circular) from sacCer3 genome
seqinfo(cds_list)
## Seqinfo object with 17 sequences (1 circular) from sacCer3 genome:
## seqnames seqlengths isCircular genome
## chrI 230218 <NA> sacCer3
## chrII 813184 <NA> sacCer3
## chrIII 316620 <NA> sacCer3
## chrIV 1531933 <NA> sacCer3
## chrV 576874 <NA> sacCer3
## ... ... ... ...
## chrXIII 924431 <NA> sacCer3
## chrXIV 784333 <NA> sacCer3
## chrXV 1091291 <NA> sacCer3
## chrXVI 948066 <NA> sacCer3
## chrM 85779 TRUE sacCer3
genome(cds_list)
## chrI chrII chrIII chrIV chrV chrVI chrVII
## "sacCer3" "sacCer3" "sacCer3" "sacCer3" "sacCer3" "sacCer3" "sacCer3"
## chrVIII chrIX chrX chrXI chrXII chrXIII chrXIV
## "sacCer3" "sacCer3" "sacCer3" "sacCer3" "sacCer3" "sacCer3" "sacCer3"
## chrXV chrXVI chrM
## "sacCer3" "sacCer3" "sacCer3"
GRanges and GRangesLists extracted from the txdb have associated seqinfo. Bioconductor will give an error if you try to use them with the wrong genome.
# The ...By functions return GRangesList objects.
# To flatten down to a GRanges, use unlist.
unlist(cds_list)
## GRanges object with 7050 ranges and 3 metadata columns:
## seqnames ranges strand | cds_id cds_name
## <Rle> <IRanges> <Rle> | <integer> <character>
## YAL069W chrI [ 335, 649] + | 1 <NA>
## YAL068W-A chrI [ 538, 792] + | 2 <NA>
## YAL067W-A chrI [ 2480, 2707] + | 3 <NA>
## YAL066W chrI [10091, 10399] + | 4 <NA>
## YAL064W-B chrI [12046, 12426] + | 5 <NA>
## ... ... ... ... . ... ...
## Q0255 chrM [74495, 75622] + | 7030 <NA>
## Q0255 chrM [75663, 75872] + | 7031 <NA>
## Q0255 chrM [75904, 75984] + | 7032 <NA>
## Q0275 chrM [79213, 80022] + | 7033 <NA>
## Q0297 chrM [85554, 85709] + | 7034 <NA>
## exon_rank
## <integer>
## YAL069W 1
## YAL068W-A 1
## YAL067W-A 1
## YAL066W 1
## YAL064W-B 1
## ... ...
## Q0255 1
## Q0255 2
## Q0255 3
## Q0275 1
## Q0297 1
## -------
## seqinfo: 17 sequences (1 circular) from sacCer3 genome
# Note that names will not be unique unless each element has length 1.
cds_ranges <- unlist( range(cds_list) )
start_codons <- resize(cds_ranges, 3, fix="start")
start_seqs <- getSeq(genome, start_codons)
table(start_seqs)
## start_seqs
## AGC AGT ATA ATG
## 1 3 6 6682
Many Bioconductor types have a List version. This allows use of the split-apply-combine pattern. List classes support most of the same functions as their base type. If a function isn’t supported, use lapply.
Here
Use split()
to perform the splitting step manually.
In DNA, a stop codon can be TAG, TAA or TGA.
Are stop codons included in the CDS sequence, or just after it?
Does yeast have a preferred stop codon?
Hint: Use resize(..., fix="end")
and flank(..., start=FALSE)
The above approach to extracting start codons has a potential flaw: an intron may lie within the start codon.
# An alternative way to get start codons.
cds_seqs <- extractTranscriptSeqs(genome, cds_list)
table( subseq(cds_seqs, 1, 3) )
##
## ATA ATG
## 6 6686
It actually happens! “AGC” and “AGT” which we saw previously are gone when we handle introns correctly.
cds_list[ start_seqs == "AGC" ]
## GRangesList object of length 1:
## $YIL111W
## GRanges object with 2 ranges and 3 metadata columns:
## seqnames ranges strand | cds_id cds_name exon_rank
## <Rle> <IRanges> <Rle> | <integer> <character> <integer>
## [1] chrIX [155222, 155222] + | 3200 <NA> 1
## [2] chrIX [155311, 155765] + | 3201 <NA> 2
##
## -------
## seqinfo: 17 sequences (1 circular) from sacCer3 genome
cds_list[ start_seqs == "AGT" ]
## GRangesList object of length 3:
## $YJL041W
## GRanges object with 2 ranges and 3 metadata columns:
## seqnames ranges strand | cds_id cds_name exon_rank
## <Rle> <IRanges> <Rle> | <integer> <character> <integer>
## [1] chrX [365784, 365784] + | 3523 <NA> 1
## [2] chrX [365903, 368373] + | 3524 <NA> 2
##
## $YMR242C
## GRanges object with 2 ranges and 3 metadata columns:
## seqnames ranges strand | cds_id cds_name exon_rank
## [1] chrXIII [754220, 754220] - | 5309 <NA> 1
## [2] chrXIII [753225, 753742] - | 5308 <NA> 2
##
## $YOR312C
## GRanges object with 2 ranges and 3 metadata columns:
## seqnames ranges strand | cds_id cds_name exon_rank
## [1] chrXV [901194, 901194] - | 6397 <NA> 1
## [2] chrXV [900250, 900767] - | 6396 <NA> 2
##
## -------
## seqinfo: 17 sequences (1 circular) from sacCer3 genome
It’s common to want to characterize some collection of locations in a genome: transcription start or end sites, protein binding sites, etc. Bioconductor’s core packages make a first pass at characterization fairly simple. You might then proceed to a specialized tool such as a de-novo motif finder.
As an example, let’s look at sequence upstrand of CDS regions.
library(seqLogo)
## Loading required package: grid
upstrand_ranges <- flank(cds_ranges, 10, start=TRUE)
upstrand_seq <- getSeq(genome, upstrand_ranges)
upstrand_seq
## A DNAStringSet instance of length 6692
## width seq names
## [1] 10 ACGCTAACAA YAL069W
## [2] 10 TCACATCATT YAL068W-A
## [3] 10 GAATGTGGGA YAL067W-A
## [4] 10 AAGCACCATC YAL066W
## [5] 10 CGTCGAAACA YAL064W-B
## ... ... ...
## [6688] 10 TTATTTATTA Q0182
## [6689] 10 ATTTATTAAA Q0250
## [6690] 10 AATTTTTGGA Q0255
## [6691] 10 CAATAAATTT Q0275
## [6692] 10 AATAAATAAT Q0297
letter_counts <- consensusMatrix(upstrand_seq)
probs <- prop.table(letter_counts[1:4,], 2)
seqLogo(probs, ic.scale=FALSE)
seqLogo(probs)
# We may also be interested in k-mer frequencies
colSums( oligonucleotideFrequency(upstrand_seq, 2) )
## AA AC AG AT CA CC CG CT GA GC GG GT
## 10308 4067 4450 5359 4993 1703 1610 2454 3653 2079 1657 2468
## TA TC TG TT
## 5687 2919 2205 4616
sessionInfo()
## R version 3.3.3 (2017-03-06)
## Platform: x86_64-apple-darwin16.4.0 (64-bit)
## Running under: macOS Sierra 10.12.4
##
## locale:
## [1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
##
## attached base packages:
## [1] grid parallel stats4 methods stats graphics grDevices
## [8] utils datasets base
##
## other attached packages:
## [1] seqLogo_1.40.0
## [2] TxDb.Scerevisiae.UCSC.sacCer3.sgdGene_3.2.2
## [3] GenomicFeatures_1.26.4
## [4] AnnotationDbi_1.36.2
## [5] Biobase_2.34.0
## [6] BSgenome.Scerevisiae.UCSC.sacCer3_1.4.0
## [7] BSgenome_1.42.0
## [8] rtracklayer_1.34.2
## [9] Biostrings_2.42.1
## [10] XVector_0.14.1
## [11] GenomicRanges_1.26.4
## [12] GenomeInfoDb_1.10.3
## [13] IRanges_2.8.2
## [14] S4Vectors_0.12.2
## [15] BiocGenerics_0.20.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.10 knitr_1.15.1
## [3] magrittr_1.5 GenomicAlignments_1.10.1
## [5] zlibbioc_1.20.0 BiocParallel_1.8.2
## [7] lattice_0.20-34 stringr_1.2.0
## [9] tools_3.3.3 SummarizedExperiment_1.4.0
## [11] DBI_0.6-1 htmltools_0.3.5
## [13] yaml_2.1.14 rprojroot_1.2
## [15] digest_0.6.12 Matrix_1.2-8
## [17] bitops_1.0-6 biomaRt_2.30.0
## [19] RCurl_1.95-4.8 memoise_1.0.0
## [21] RSQLite_1.1-2 evaluate_0.10
## [23] rmarkdown_1.4 stringi_1.1.5
## [25] backports_1.0.5 Rsamtools_1.26.2
## [27] XML_3.98-1.7