library(knitr)
knitr::opts_chunk$set(fig.width=8,
fig.height=6, echo=T,
warning=FALSE, message=FALSE,
prompt=T,tidy=T,tidy.opts=list(width.cutoff=50),
include=TRUE,cache=TRUE)
# To output a pdf or html document simply substitute github_document with pdf_document or html_document in the header of the file
# Working directory - To be changed once the repository is cloned
dir <- "."
> # Bioconductor packages
> source("https://bioconductor.org/biocLite.R")
> biocLite(c("GenomicRanges", "rtracklayer", "Rsamtools",
+ "GenomicAlignments", "TxDb.Hsapiens.UCSC.hg19.knownGene",
+ "AnnotationHub"))
>
> # CRAN packages
> install.packages("ggplot2")
> install.packages("knitr")
> # Load packages
> library("ggplot2")
> library("GenomicRanges")
> library("Rsamtools")
> library("GenomicAlignments")
> library("rtracklayer")
> library("TxDb.Hsapiens.UCSC.hg19.knownGene")
> library("AnnotationHub")
> library("knitr")
GenomicRanges
Imagine you have done an experiment on Sample 1 and your output is a set of genomic ranges. Let’s start by manually creating a very simple GRanges
objects with ranges on chr1 and chr2.
> # Create and example: Sample 1
> gr_S1 <- GRanges(seqnames = rep(c("chr1", "chr2"),
+ times = c(3, 2)), ranges = IRanges(start = c(5,
+ 8, 20, 8, 18), end = c(11, 15, 26, 16, 21), names = c(paste("Range_",
+ 1:5, sep = ""))), strand = rep(strand(c("*")),
+ times = 5))
>
> gr_S1
## GRanges object with 5 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## Range_1 chr1 [ 5, 11] *
## Range_2 chr1 [ 8, 15] *
## Range_3 chr1 [20, 26] *
## Range_4 chr2 [ 8, 16] *
## Range_5 chr2 [18, 21] *
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
A very common analysis to perform is to evaluate to what extent and where your ranges overlap with some features, such as genes, exons, etc… As an example we create a simple fake gene annotation to use with the ranges created above.
> # Gene annotation
> genes <- GRanges(seqnames = rep(c("chr1", "chr2"),
+ times = c(2, 2)), ranges = IRanges(start = c(7,
+ 17, 7, 23), end = c(15, 23, 14, 26), names = c(paste("Gene_",
+ 1:4, sep = ""))), strand = rep(strand(c("+")),
+ times = 4))
>
> genes
## GRanges object with 4 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## Gene_1 chr1 [ 7, 15] +
## Gene_2 chr1 [17, 23] +
## Gene_3 chr2 [ 7, 14] +
## Gene_4 chr2 [23, 26] +
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
Figure 1 is a simple way of plotting ranges stored into Granges
object using geom_rect()
from ggplot2
. More advanced plotting options are available but this simple strategy is sufficient for now and also every excuse is good to use ggplot
!
> # Plot ranges with ggplot2
> gr <- as.data.frame(rbind(as.data.frame(gr_S1), as.data.frame(genes)))
> gr$rangeID <- c(names(gr_S1), names(genes))
> gr$Sample <- c(rep("Peaks", length(gr_S1)), rep("Genes",
+ length(genes)))
>
> ggplot(data = gr, aes(xmin = start, xmax = end, ymin = 0,
+ ymax = 1)) + geom_rect(aes(fill = rangeID), alpha = 0.4) +
+ facet_wrap(~Sample + seqnames) + theme_bw() + labs(x = "Genomic position",
+ y = "Ranges") + theme(axis.ticks.y = element_blank(),
+ axis.text.y = element_blank())
GRanges
objectsfindOverlaps()
By default looks for overlaps of a minimum of 1bp between a query
and a subject
.
> ### `findOverlaps()`
> `?`(findOverlaps)
## Help on topic 'findOverlaps' was found in the following packages:
##
## Package Library
## SummarizedExperiment
## /wehisan/general/system/bioinf-software/bioinfsoftware/R/R-3.3.0/lib64/R/library
## IRanges /wehisan/general/system/bioinf-software/bioinfsoftware/R/R-3.3.0/lib64/R/library
## GenomicRanges /wehisan/home/allstaff/q/quaglieri.a/R/x86_64-pc-linux-gnu-library/3.3
## GenomicAlignments /wehisan/general/system/bioinf-software/bioinfsoftware/R/R-3.3.0/lib64/R/library
##
##
## Using the first match ...
> overlaps <- findOverlaps(query = gr_S1, subject = genes)
> overlaps
## Hits object with 4 hits and 0 metadata columns:
## queryHits subjectHits
## <integer> <integer>
## [1] 1 1
## [2] 2 1
## [3] 3 2
## [4] 4 3
## -------
## queryLength: 5 / subjectLength: 4
> # Query the output
> queryHits(overlaps)
## [1] 1 2 3 4
> subjectHits(overlaps)
## [1] 1 1 2 3
> subjectLength(overlaps)
## [1] 4
> # You can allow for a gap to be ignored
> overlaps <- findOverlaps(gr_S1, genes, maxgap = 5)
> overlaps
## Hits object with 8 hits and 0 metadata columns:
## queryHits subjectHits
## <integer> <integer>
## [1] 1 1
## [2] 2 1
## [3] 2 2
## [4] 3 1
## [5] 3 2
## [6] 4 3
## [7] 5 3
## [8] 5 4
## -------
## queryLength: 5 / subjectLength: 4
> # You can specify a minimum overlap
> overlaps <- findOverlaps(gr_S1, genes, minoverlap = 5)
> overlaps
## Hits object with 3 hits and 0 metadata columns:
## queryHits subjectHits
## <integer> <integer>
## [1] 1 1
## [2] 2 1
## [3] 4 3
## -------
## queryLength: 5 / subjectLength: 4
> # You can specify a minimum overlap
> overlaps <- findOverlaps(gr_S1, genes, type = "start")
> overlaps
## Hits object with 0 hits and 0 metadata columns:
## queryHits subjectHits
## <integer> <integer>
## -------
## queryLength: 5 / subjectLength: 4
> overlaps <- findOverlaps(gr_S1, genes, type = "end")
> overlaps
## Hits object with 1 hit and 0 metadata columns:
## queryHits subjectHits
## <integer> <integer>
## [1] 2 1
## -------
## queryLength: 5 / subjectLength: 4
> overlaps <- findOverlaps(gr_S1, genes, type = "within")
> overlaps
## Hits object with 1 hit and 0 metadata columns:
## queryHits subjectHits
## <integer> <integer>
## [1] 2 1
## -------
## queryLength: 5 / subjectLength: 4
countOverlaps()
> ## `countOverlaps()`
> `?`(countOverlaps)
## Help on topic 'countOverlaps' was found in the following packages:
##
## Package Library
## IRanges /wehisan/general/system/bioinf-software/bioinfsoftware/R/R-3.3.0/lib64/R/library
## GenomicRanges /wehisan/home/allstaff/q/quaglieri.a/R/x86_64-pc-linux-gnu-library/3.3
##
##
## Using the first match ...
> N_overlaps <- countOverlaps(gr_S1, genes)
> N_overlaps
## Range_1 Range_2 Range_3 Range_4 Range_5
## 1 1 1 1 0
> # You can play around with the same options as
> # findOverlaps()
GenomicRanges
nearest()
> ## Nearest-methods in `GenomicRanges` `nearest()`
> `?`(nearest)
## Help on topic 'nearest' was found in the following packages:
##
## Package Library
## SummarizedExperiment
## /wehisan/general/system/bioinf-software/bioinfsoftware/R/R-3.3.0/lib64/R/library
## IRanges /wehisan/general/system/bioinf-software/bioinfsoftware/R/R-3.3.0/lib64/R/library
## GenomicRanges /wehisan/home/allstaff/q/quaglieri.a/R/x86_64-pc-linux-gnu-library/3.3
##
##
## Using the first match ...
It returns a vector of indeces referring to the nearest neighbour range in subject
for every range in x
. By default if one range overlaps with multiple genes then one overlap will be chosen at random:
> nearest(x = gr_S1, subject = genes)
## [1] 1 1 2 3 4
> nearest(x = gr_S1, subject = genes, select = "all")
## Hits object with 5 hits and 0 metadata columns:
## queryHits subjectHits
## <integer> <integer>
## [1] 1 1
## [2] 2 1
## [3] 3 2
## [4] 4 3
## [5] 5 4
## -------
## queryLength: 5 / subjectLength: 4
> nearest(gr_S1)
## [1] 2 1 2 5 4
> nearest(gr_S1, gr_S1)
## [1] 1 1 3 4 5
distance()
> ### `distance()`
> `?`(distance)
## Help on topic 'distance' was found in the following packages:
##
## Package Library
## SummarizedExperiment
## /wehisan/general/system/bioinf-software/bioinfsoftware/R/R-3.3.0/lib64/R/library
## IRanges /wehisan/general/system/bioinf-software/bioinfsoftware/R/R-3.3.0/lib64/R/library
## GenomicRanges /wehisan/home/allstaff/q/quaglieri.a/R/x86_64-pc-linux-gnu-library/3.3
##
##
## Using the first match ...
> distance(x = gr_S1[1], y = genes)
## [1] 0 5 NA NA
> distance(x = gr_S1[1:4], y = genes)
## [1] 0 1 NA 6
> distance(x = gr_S1, y = genes)
## [1] 0 1 NA 6 NA
distance()
is a symmetric function which means that it requires x and y to have to have the same lenght and if one is shorter than the other one it will be recycled to match the length of the longest. Also, the distance between two consecutive blocks is 0 not 1 which affects the notion of overlaps. If distance(x, y) == 0
then x and y can be either adjacent or overlapping ranges.
distanceToNearest()
> ### `distanceToNearest()`
> `?`(distanceToNearest)
## Help on topic 'distanceToNearest' was found in the following
## packages:
##
## Package Library
## SummarizedExperiment
## /wehisan/general/system/bioinf-software/bioinfsoftware/R/R-3.3.0/lib64/R/library
## IRanges /wehisan/general/system/bioinf-software/bioinfsoftware/R/R-3.3.0/lib64/R/library
## GenomicRanges /wehisan/home/allstaff/q/quaglieri.a/R/x86_64-pc-linux-gnu-library/3.3
##
##
## Using the first match ...
For every range in x
it will return the index and the distance to its nearest neighbour in subject
.
> distanceToNearest(x = gr_S1, subject = genes)
## Hits object with 5 hits and 1 metadata column:
## queryHits subjectHits | distance
## <integer> <integer> | <integer>
## [1] 1 1 | 0
## [2] 2 1 | 0
## [3] 3 2 | 0
## [4] 4 3 | 0
## [5] 5 4 | 1
## -------
## queryLength: 5 / subjectLength: 4
GRangesList
GRangesList
are lists of GRanges
objects.
> # Create a GRangesList Sample 1
> gr_S1 <- GRanges(seqnames = rep(c("chr1", "chr2"),
+ times = c(3, 2)), ranges = IRanges(start = c(5,
+ 8, 20, 8, 18), end = c(11, 15, 26, 16, 21), names = c(paste("Region_",
+ 1:5, sep = ""))), strand = rep("*", times = 5))
>
> gr_S1
## GRanges object with 5 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## Region_1 chr1 [ 5, 11] *
## Region_2 chr1 [ 8, 15] *
## Region_3 chr1 [20, 26] *
## Region_4 chr2 [ 8, 16] *
## Region_5 chr2 [18, 21] *
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
> # Sample 2
> gr_S2 <- GRanges(seqnames = rep(c("chr2", "chr3"),
+ times = c(3, 5)), ranges = IRanges(start = c(1:8),
+ width = 10, names = c(paste("Region_", 1:8, sep = ""))),
+ strand = rep("*", times = 8))
>
> gr_S2
## GRanges object with 8 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## Region_1 chr2 [1, 10] *
## Region_2 chr2 [2, 11] *
## Region_3 chr2 [3, 12] *
## Region_4 chr3 [4, 13] *
## Region_5 chr3 [5, 14] *
## Region_6 chr3 [6, 15] *
## Region_7 chr3 [7, 16] *
## Region_8 chr3 [8, 17] *
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
> # GRanges List
> list_ranges <- GRangesList(Sample1 = gr_S1, Sample2 = gr_S2)
> length(list)
## [1] 1
Many of the functions learnt for GRanges
can also be applied to GRangesList
objects even though the output will have to be interepreted accordingly:
> names(list_ranges)
## [1] "Sample1" "Sample2"
> length(list_ranges)
## [1] 2
> seqnames(list_ranges)
## RleList of length 2
## $Sample1
## factor-Rle of length 5 with 2 runs
## Lengths: 3 2
## Values : chr1 chr2
## Levels(3): chr1 chr2 chr3
##
## $Sample2
## factor-Rle of length 8 with 2 runs
## Lengths: 3 5
## Values : chr2 chr3
## Levels(3): chr1 chr2 chr3
> strand(list_ranges)
## RleList of length 2
## $Sample1
## factor-Rle of length 5 with 1 run
## Lengths: 5
## Values : *
## Levels(3): + - *
##
## $Sample2
## factor-Rle of length 8 with 1 run
## Lengths: 8
## Values : *
## Levels(3): + - *
> ranges(list_ranges)
## IRangesList of length 2
## $Sample1
## IRanges object with 5 ranges and 0 metadata columns:
## start end width
## <integer> <integer> <integer>
## Region_1 5 11 7
## Region_2 8 15 8
## Region_3 20 26 7
## Region_4 8 16 9
## Region_5 18 21 4
##
## $Sample2
## IRanges object with 8 ranges and 0 metadata columns:
## start end width
## <integer> <integer> <integer>
## Region_1 1 10 10
## Region_2 2 11 10
## Region_3 3 12 10
## Region_4 4 13 10
## Region_5 5 14 10
## Region_6 6 15 10
## Region_7 7 16 10
## Region_8 8 17 10
> start(list_ranges)
## IntegerList of length 2
## [["Sample1"]] 5 8 20 8 18
## [["Sample2"]] 1 2 3 4 5 6 7 8
> end(list_ranges)
## IntegerList of length 2
## [["Sample1"]] 11 15 26 16 21
## [["Sample2"]] 10 11 12 13 14 15 16 17
> width(list_ranges)
## IntegerList of length 2
## [["Sample1"]] 7 8 7 9 4
## [["Sample2"]] 10 10 10 10 10 10 10 10
> unlist(list_ranges)
## GRanges object with 13 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## Sample1.Region_1 chr1 [ 5, 11] *
## Sample1.Region_2 chr1 [ 8, 15] *
## Sample1.Region_3 chr1 [20, 26] *
## Sample1.Region_4 chr2 [ 8, 16] *
## Sample1.Region_5 chr2 [18, 21] *
## ... ... ... ...
## Sample2.Region_4 chr3 [4, 13] *
## Sample2.Region_5 chr3 [5, 14] *
## Sample2.Region_6 chr3 [6, 15] *
## Sample2.Region_7 chr3 [7, 16] *
## Sample2.Region_8 chr3 [8, 17] *
## -------
## seqinfo: 3 sequences from an unspecified genome; no seqlengths
To get the number of ranges in every object of the list use elementNROWS
:
> elementNROWS(list_ranges)
## Sample1 Sample2
## 5 8
Granges
listAdd an extra column to every GRanges
in the GRangesList
.
> addCols <- lapply(list_ranges, function(x) {
+ elementMetadata(x) <- data.frame(NumberReads = rbinom(length(x),
+ size = 100, prob = 0.5))
+ return(x)
+ })
> class(addCols)
## [1] "list"
> addCols <- GRangesList(addCols)
> class(addCols)
## [1] "GRangesList"
## attr(,"package")
## [1] "GenomicRanges"
In many cases Granges
objects can be subsetted using the same rules that apply to normal lists or data frames in R:
> # Subsetting As a list
> addCols[[1]]
## GRanges object with 5 ranges and 1 metadata column:
## seqnames ranges strand | NumberReads
## <Rle> <IRanges> <Rle> | <integer>
## Region_1 chr1 [ 5, 11] * | 45
## Region_2 chr1 [ 8, 15] * | 61
## Region_3 chr1 [20, 26] * | 49
## Region_4 chr2 [ 8, 16] * | 57
## Region_5 chr2 [18, 21] * | 58
## -------
## seqinfo: 3 sequences from an unspecified genome; no seqlengths
> addCols["Sample1"]
## GRangesList object of length 1:
## $Sample1
## GRanges object with 5 ranges and 1 metadata column:
## seqnames ranges strand | NumberReads
## <Rle> <IRanges> <Rle> | <integer>
## Region_1 chr1 [ 5, 11] * | 45
## Region_2 chr1 [ 8, 15] * | 61
## Region_3 chr1 [20, 26] * | 49
## Region_4 chr2 [ 8, 16] * | 57
## Region_5 chr2 [18, 21] * | 58
##
## -------
## seqinfo: 3 sequences from an unspecified genome; no seqlengths
> # As a data.frame object
> addCols[1, "NumberReads"]
## GRangesList object of length 1:
## $Sample1
## GRanges object with 5 ranges and 1 metadata column:
## seqnames ranges strand | NumberReads
## <Rle> <IRanges> <Rle> | <integer>
## Region_1 chr1 [ 5, 11] * | 45
## Region_2 chr1 [ 8, 15] * | 61
## Region_3 chr1 [20, 26] * | 49
## Region_4 chr2 [ 8, 16] * | 57
## Region_5 chr2 [18, 21] * | 58
##
## -------
## seqinfo: 3 sequences from an unspecified genome; no seqlengths
> addCols["Sample1", "NumberReads"]
## GRangesList object of length 1:
## $Sample1
## GRanges object with 5 ranges and 1 metadata column:
## seqnames ranges strand | NumberReads
## <Rle> <IRanges> <Rle> | <integer>
## Region_1 chr1 [ 5, 11] * | 45
## Region_2 chr1 [ 8, 15] * | 61
## Region_3 chr1 [20, 26] * | 49
## Region_4 chr2 [ 8, 16] * | 57
## Region_5 chr2 [18, 21] * | 58
##
## -------
## seqinfo: 3 sequences from an unspecified genome; no seqlengths
> # Looping
> lapply(addCols, length)
## $Sample1
## [1] 5
##
## $Sample2
## [1] 8
> sapply(addCols, length)
## Sample1 Sample2
## 5 8
Similar considerations apply for the other functions explored above like findOverlaps()
, countOverlaps()
, etc…
Use the code below to download the
cpg_islands
object containing the CpG islands for the Human chrY and thetxdb
gene annotation fromTxDb.Hsapiens.UCSC.hg19.knownGene
.
> # Download CpG islands from UCSC genome browser in
> # the same way you can extract many more genomic
> # object
> session <- browserSession("UCSC")
> query <- ucscTableQuery(session, "CpG Islands", GRangesForUCSCGenome("hg19",
+ "chrY"))
> cpg_islands <- getTable(query)
> # Download genes
> txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
- What is the class of
cpg_islands
? Convertcpg_islands
to aGRanges
object.- Using what you have previosly learnt, get the transcript for every gene from the
txdb
gene annotation and store them into thetranscriptHg19
object (suggestion: seetranscriptsBy
).- Subset
transcriptHg19
to extract only the transcript on chr21 (suggestion: first run unlist(transcriptHg19)).- Find the nearest CpG island in
cpg_islandsGR
to every transcript intranscriptHg19
as well as their distance.- Optional: On average, every CpG island is close to how many genes?
Rtracklayer
Now that the structure and manipulation of GRanges
objects is clear, the rtracklayer
package comes as a useful tool to import/export GRanges
objects from and to different data formats commonly used in genomic analyses. Rtracklayer
stands as an interface to mediate the crosstalk between R and genome browsers (UCSC built-in). Example of data files supported by this package are BED, bigWIG, wig, gff and gtf formats.
Another very important function of this package is to allow the visualisations of genomic annotation tracks. However, this part will not covered in the present tutorial. For more references see the Rtracklayer Bioconductor page.
There is a very large number of different data formats used to store different sort of genomic data. In this tutororial only few of them will be covered as example. However, the UCSC genome growser offers a useful FAQ page where one can find the specification of all mandatory and optional fields for every data format.
The GFF and GTF are the preferred format used to store annotations and their exact specification is summarised in GFF/GTF File Format.
The function import()
is used to import all the supported data types into a GRanges
object in R. The function recognises the format from the extension of the file but the argument format
can be used to expliciclty define it. The function export()
works in the same way and it is used to export GRanges
objects to files.
> # Rtracklayer Import/Export with Rtrcaklayer
> gr_S1
## GRanges object with 5 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## Region_1 chr1 [ 5, 11] *
## Region_2 chr1 [ 8, 15] *
## Region_3 chr1 [20, 26] *
## Region_4 chr2 [ 8, 16] *
## Region_5 chr2 [18, 21] *
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
> export(gr_S1, file.path(dir, "transcriptHg19.gff"))
> export(gr_S1, file.path(dir, "transcriptHg19.gtf"))
>
> importHg19 <- import(file.path(dir, "transcriptHg19.gff"))
> class(importHg19)
## [1] "GRanges"
## attr(,"package")
## [1] "GenomicRanges"
> importHg19
## GRanges object with 5 ranges and 5 metadata columns:
## seqnames ranges strand | source type score
## <Rle> <IRanges> <Rle> | <factor> <factor> <numeric>
## [1] chr1 [ 5, 11] * | rtracklayer sequence_feature <NA>
## [2] chr1 [ 8, 15] * | rtracklayer sequence_feature <NA>
## [3] chr1 [20, 26] * | rtracklayer sequence_feature <NA>
## [4] chr2 [ 8, 16] * | rtracklayer sequence_feature <NA>
## [5] chr2 [18, 21] * | rtracklayer sequence_feature <NA>
## phase group
## <integer> <factor>
## [1] <NA> chr1
## [2] <NA> chr1
## [3] <NA> chr1
## [4] <NA> chr2
## [5] <NA> chr2
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
- Create a simple
GRanges
object made out of 20 ranges and export it as agff
file.- Read the file back into R.
- Optional: Print the number of ranges of the
GRanges
object just imported and create a histogram of the widths of the ranges.
The WIG data format is used for the disply of dense continuous data, like scores. Every line in the file is an intervals and WIG files only allow equally spaced intervals. For intervals of variable width one should use the bedGraph format. There are two main data formats: the variableStep and the fixedStep. The WIG file is constituted by one or more blocks separated by declaration lines. Let’s go through them with two examples.
The following example is a WIG file with only one block of coordinates. The field chrom is required.
variableStep chrom=chr2
300701 12.5
300702 12.5
300703 12.5
300704 12.5
300705 12.5
The same file can be defined as follows using the span argument.
variableStep chrom=chr2 span=5
300701 12.5
This format allows a more compact way of storing intervals. In this situation the span would not change the dimension of the file.
fixedStep chrom=chr3 start=400601 step=100
11
22
33
The bigWig file is created from a WIG file and in R
this can be achieved through wigToBigWig()
. bigWig is the recommended format for very large data tracks due to the way in which data are compressed and stored in an indexed binary format. Usually whole-genome coverage vectors are stored as bigWig. Howver, the loss is negligible when dealing with very large amount of data.
> ## Wiggle (WIG) and bigWIG Import
> wig_path <- file.path(dir, "exampleWIG.wig")
> import_wig <- import(con = wig_path, seqinfo = Seqinfo(genome = "mm10"))
> import_wig
## GRanges object with 16 ranges and 1 metadata column:
## seqnames ranges strand | score
## <Rle> <IRanges> <Rle> | <numeric>
## [1] chr2 [300701, 300701] * | 12.5
## [2] chr2 [300702, 300702] * | 12.5
## [3] chr2 [300703, 300703] * | 12.5
## [4] chr2 [300704, 300704] * | 12.5
## [5] chr2 [300705, 300705] * | 12.5
## ... ... ... ... . ...
## [12] chr3 [401501, 401501] * | 10.5
## [13] chr4 [500001, 500001] * | 10
## [14] chr4 [500201, 500201] * | 20
## [15] chr4 [500401, 500401] * | 20
## [16] chr4 [500601, 500601] * | 20
## -------
## seqinfo: 66 sequences (1 circular) from mm10 genome
> `?`(Seqinfo)
> seqinfo(import_wig)
## Seqinfo object with 66 sequences (1 circular) from mm10 genome:
## seqnames seqlengths isCircular genome
## chr1 195471971 FALSE mm10
## chr2 182113224 FALSE mm10
## chr3 160039680 FALSE mm10
## chr4 156508116 FALSE mm10
## chr5 151834684 FALSE mm10
## ... ... ... ...
## chrUn_GL456392 23629 FALSE mm10
## chrUn_GL456393 55711 FALSE mm10
## chrUn_GL456394 24323 FALSE mm10
## chrUn_GL456396 21240 FALSE mm10
## chrUn_JH584304 114452 FALSE mm10
Convert the WIG file into a bigWig file and import the bigWig file into R.
> bigwig_path <- file.path(dir, "example_wigToBigWig.bw")
> wigToBigWig(wig_path, dest = bigwig_path, seqinfo = seqinfo(import_wig))
> import_bigwig <- import(con = bigwig_path)
The import
functions allows to load into a R only a specific regions defined using a GRanges
object.
> # Define a region
> which_region <- GRanges(seqnames = "chr3", IRanges(start = 1,
+ end = 5e+05))
> import_bigwig_region <- import(con = bigwig_path, which = which_region)
liftOver
from different genome releasesThis function allows to convert genomic coordinate between difference releases of the same genome or even between different species.
> `?`(AnnotationHub)
> ahub <- AnnotationHub()
> ahub
## AnnotationHub with 37202 records
## # snapshotDate(): 2016-10-11
## # $dataprovider: BroadInstitute, UCSC, Ensembl, ftp://ftp.ncbi.nlm.nih....
## # $species: Homo sapiens, Mus musculus, Bos taurus, Pan troglodytes, Da...
## # $rdataclass: GRanges, BigWigFile, FaFile, TwoBitFile, ChainFile, OrgD...
## # additional mcols(): taxonomyid, genome, description,
## # coordinate_1_based, maintainer, rdatadateadded, preparerclass,
## # tags, sourceurl, sourcetype
## # retrieve records with, e.g., 'object[["AH2"]]'
##
## title
## AH2 | Ailuropoda_melanoleuca.ailMel1.69.dna.toplevel.fa
## AH3 | Ailuropoda_melanoleuca.ailMel1.69.dna_rm.toplevel.fa
## AH4 | Ailuropoda_melanoleuca.ailMel1.69.dna_sm.toplevel.fa
## AH5 | Ailuropoda_melanoleuca.ailMel1.69.ncrna.fa
## AH6 | Ailuropoda_melanoleuca.ailMel1.69.pep.all.fa
## ... ...
## AH51457 | Xiphophorus_maculatus.Xipmac4.4.2.dna_rm.toplevel.2bit
## AH51458 | Xiphophorus_maculatus.Xipmac4.4.2.dna_sm.toplevel.2bit
## AH51459 | Xiphophorus_maculatus.Xipmac4.4.2.dna.toplevel.2bit
## AH51460 | Xiphophorus_maculatus.Xipmac4.4.2.ncrna.2bit
## AH51461 | Alternative Splicing Annotation for Homo sapiens (Human)
> # Chain files are data format used to convert
> # coordinates between genome references
> ahub.chain <- subset(ahub, rdataclass == "ChainFile" &
+ species == "Homo sapiens")
> ahub.chain2 <- subset(ahub, rdataclass == "ChainFile" &
+ species %in% c("Homo sapiens", "Mus musculus"))
> query(ahub.chain, c("hg18", "hg19"))
## AnnotationHub with 2 records
## # snapshotDate(): 2016-10-11
## # $dataprovider: UCSC
## # $species: Homo sapiens
## # $rdataclass: ChainFile
## # additional mcols(): taxonomyid, genome, description,
## # coordinate_1_based, maintainer, rdatadateadded, preparerclass,
## # tags, sourceurl, sourcetype
## # retrieve records with, e.g., 'object[["AH14149"]]'
##
## title
## AH14149 | hg19ToHg18.over.chain.gz
## AH14220 | hg18ToHg19.over.chain.gz
> chain <- ahub.chain[ahub.chain$title == "hg19ToHg18.over.chain.gz"]
> chain <- chain[[1]]
> gr.hg18 <- liftOver(import_bigwig, chain)
> gr.hg18
## GRangesList object of length 16:
## [[1]]
## GRanges object with 1 range and 1 metadata column:
## seqnames ranges strand | score
## <Rle> <IRanges> <Rle> | <numeric>
## [1] chr2 [290701, 290701] * | 12.5
##
## [[2]]
## GRanges object with 1 range and 1 metadata column:
## seqnames ranges strand | score
## [1] chr2 [290702, 290702] * | 12.5
##
## [[3]]
## GRanges object with 1 range and 1 metadata column:
## seqnames ranges strand | score
## [1] chr2 [290703, 290703] * | 12.5
##
## ...
## <13 more elements>
## -------
## seqinfo: 3 sequences from an unspecified genome; no seqlengths
> unlist(gr.hg18)
## GRanges object with 16 ranges and 1 metadata column:
## seqnames ranges strand | score
## <Rle> <IRanges> <Rle> | <numeric>
## [1] chr2 [290701, 290701] * | 12.5
## [2] chr2 [290702, 290702] * | 12.5
## [3] chr2 [290703, 290703] * | 12.5
## [4] chr2 [290704, 290704] * | 12.5
## [5] chr2 [290705, 290705] * | 12.5
## ... ... ... ... . ...
## [12] chr3 [376501, 376650] * | 10.5
## [13] chr4 [490001, 490200] * | 10
## [14] chr4 [490201, 490400] * | 20
## [15] chr4 [490401, 490600] * | 20
## [16] chr4 [490601, 490800] * | 20
## -------
## seqinfo: 3 sequences from an unspecified genome; no seqlengths
Rsamtools
Rsamtools is an R interface for Samtools which offers a large variety of utilities to manipulate SAM and BAM files. The main purose of Rsamtools is to import BAM files into R. This will allow the user to then create objects which can be used for several types of downstream analyses. BAM files are the standard way of storing ‘short’ aligned (and unaligned) reads produced by any type of experiment (RNA-Seq, ChIP-Seq, Methylation data, etc). BAM files are binary versions of SAM files and can be indexed allowing to access only localised regions of the genome, similar to what we have seen for the bigWig file. Each read is stored in a BAM file with several other measures like base quality, position of the 5’ end of the read, read name, etc …. Rsamtools allows the user to specify which parameters of the reads to load into R. A detailed description of the fields in the BAM and their names is available at http://samtools.github.io/hts-specs/SAMv1.pdf. It has to be remembered that BAM files can be very large and there are limits in how much can be done into R and for more complex analyses is preferred to work with Samtools is prefered.
>
+ D00626:239:CAFMCANXX:4:1201:16222:93271 403 chr1 17018 3 38M177N62M = 16965 -330 TGGCCCAGGTCTGGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCT GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
+ RG:Z:9_CAFMCANXX_L004 NH:i:2 HI:i:2 NM:i:0 nM:i:1 AS:i:200
A very important field which is worth understanding is the FLAG of the read. This is a binary number which defines the type of read. In the example above it is the number 403. For example, the FLAG defines whether the read is unmapped or mapped or with paired end (PE) reads if the first mate is mapped and the second one is not, and so on. It is not always trivial to decode a FLAG number and the Broad Institute offers a useful online tool to do that for you https://broadinstitute.github.io/picard/explain-flags.html.
R
The function used to load BAM files into R is scanBam()
. The function ScanBamParam()
is used to specify the fields and the region/s of the BAM file to load into R. The genomic region to load is defined through GRanges
. Below is an example on how to load a few regions of 1000 bp width from a BAM file containing reads on chromosome 21 from an RNA-Seq experiment. We also require only uniquely mapped and properly paired reads to be loaded as well as we do not load PCR duplicates. The help page of ?scanBam
lists the name of the fields that can be accessed through the what
argument in the ScanBamParam()
function.
> # Path to the bamfile. In the same folder there has
> # to be the chr1.bam.bai file
> bamfile <- file.path(dir, "chr21.bam")
> # Define parameters
> which <- GRanges(seqnames = rep("chr21", 3), IRanges(start = c(9827000,
+ 16267000, 15890000), width = 10000))
> what <- c("rname", "strand", "pos", "qwidth", "seq",
+ "isize")
> # ?scanBamFlag
> flag = scanBamFlag(isDuplicate = FALSE, isProperPair = TRUE,
+ isPaired = TRUE)
>
> param <- ScanBamParam(which = which, what = what, flag = flag)
> # load reads into R
> reads = scanBam(bamfile, param = param)
>
> # Explore the reads
> names(reads)
## [1] "chr21:9827000-9836999" "chr21:16267000-16276999"
## [3] "chr21:15890000-15899999"
> region1 <- reads$`chr21:9827000-9836999`
> names(region1)
## [1] "rname" "strand" "pos" "qwidth" "isize" "seq"
> head(region1$rname)
## [1] chr21 chr21 chr21 chr21 chr21 chr21
## 25 Levels: chrM chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 ... chrY
> head(region1$pos)
## [1] 9826933 9826933 9826934 9826934 9826934 9826940
> head(region1$qwidth)
## [1] 100 100 100 100 100 100
> head(region1$seq)
## A DNAStringSet instance of length 6
## width seq
## [1] 100 CCGGGCCCGTCCTCGCGAGGCCCCCCGGCCG...TACCTACCTGGTTGATCCTGCCACTAGCATA
## [2] 100 CCGGGCCCGTCCTCGCGAGGCCCCCCGGCCG...TAACTACCTGGTTGATTCTGCCAGTAGCATA
## [3] 100 CGGGCCCGTCCTCGCGAGGCCCCCCGGCCGG...ACCTACCTGGTTGATCCTGCCAGTAGCATAT
## [4] 100 CGGGCCCGTCCTCGCGAGGCCCCCCGGCCGG...ACCTACCTGGTTGATCCTGCCAGTAGCATAT
## [5] 100 CGGGCCCGTCCTCGCGAGGCCCCCCGGCCGG...ACCTACCTGGTTGATCCTGCCAGTAGCATAT
## [6] 100 CGTCCTCGCGAGGCCCCCCGGCCGGCCGTCC...CTGGTTGATCCTGCCAGTAGCATTTGCTTGT
> head(region1$isize)
## [1] 146 144 162 162 145 158
readGAlignments
also loads BAM files into R directly into a GRanges
object. The parameters are specified as in RSamtools
with the argument param
.
> # GenomicAlignments
> readsGA <- readGAlignments(bamfile, param = param)
- Define a new
ScanBamParam()
object that satisfies the following options: use the same ranges as above, load both main alignments and PCR duplicates, properly paired and uniquely mapped reads.- Plot an histogram of the fragment sizes of every read (Suggestion: look for the
isize
fields)
It is often useful to summarise the number of reads falling within a bin to have an idea about the distribution of the reads across a region. There are several ways in which this can be accomplished and here two will be shown. Often, a read is counted in a bin if its 5’ end falls within bin. This is the approach that we will consider.
> ## Compute number of reads that falls within 100bp
> ## bins
> bins <- GRanges(seqnames = rep("chr21", 11), IRanges(start = seq(9827000,
+ 9827000 + 10000, by = 1000), width = 1000))
> # Create vector of 0-1 positions
> positions <- region1$pos
> vector_positions <- rep(0, max(positions))
> vector_positions[positions] <- 1
> # Views Object
> rangeViews <- Views(vector_positions, start = seq(min(positions),
+ max(positions), length.out = 10), width = 60)
> # viewSums
> bin_sum <- viewSums(rangeViews)
> # Add extra column to the ranges object
> binned_counts <- ranges(rangeViews)
> values(binned_counts) <- data.frame(ReadCounts = bin_sum)