Chunks options

library(knitr)
knitr::opts_chunk$set(fig.width=8, 
    fig.height=6, echo=T, 
    warning=FALSE, message=FALSE,
    prompt=T,tidy=T,tidy.opts=list(width.cutoff=50),
    include=TRUE,cache=TRUE)
# To output a pdf or html document simply substitute github_document with pdf_document or html_document in the header of the file

# Working directory - To be changed once the repository is cloned
dir <- "."

Install packages

> # Bioconductor packages
> source("https://bioconductor.org/biocLite.R")
> biocLite(c("GenomicRanges", "rtracklayer", "Rsamtools", 
+     "GenomicAlignments", "TxDb.Hsapiens.UCSC.hg19.knownGene", 
+     "AnnotationHub"))
> 
> # CRAN packages
> install.packages("ggplot2")
> install.packages("knitr")

Load packages

> # Load packages
> library("ggplot2")
> library("GenomicRanges")
> library("Rsamtools")
> library("GenomicAlignments")
> library("rtracklayer")
> library("TxDb.Hsapiens.UCSC.hg19.knownGene")
> library("AnnotationHub")
> library("knitr")

Advanced GenomicRanges

Imagine you have done an experiment on Sample 1 and your output is a set of genomic ranges. Let’s start by manually creating a very simple GRanges objects with ranges on chr1 and chr2.

> # Create and example: Sample 1
> gr_S1 <- GRanges(seqnames = rep(c("chr1", "chr2"), 
+     times = c(3, 2)), ranges = IRanges(start = c(5, 
+     8, 20, 8, 18), end = c(11, 15, 26, 16, 21), names = c(paste("Range_", 
+     1:5, sep = ""))), strand = rep(strand(c("*")), 
+     times = 5))
> 
> gr_S1
## GRanges object with 5 ranges and 0 metadata columns:
##           seqnames    ranges strand
##              <Rle> <IRanges>  <Rle>
##   Range_1     chr1  [ 5, 11]      *
##   Range_2     chr1  [ 8, 15]      *
##   Range_3     chr1  [20, 26]      *
##   Range_4     chr2  [ 8, 16]      *
##   Range_5     chr2  [18, 21]      *
##   -------
##   seqinfo: 2 sequences from an unspecified genome; no seqlengths

A very common analysis to perform is to evaluate to what extent and where your ranges overlap with some features, such as genes, exons, etc… As an example we create a simple fake gene annotation to use with the ranges created above.

> # Gene annotation
> genes <- GRanges(seqnames = rep(c("chr1", "chr2"), 
+     times = c(2, 2)), ranges = IRanges(start = c(7, 
+     17, 7, 23), end = c(15, 23, 14, 26), names = c(paste("Gene_", 
+     1:4, sep = ""))), strand = rep(strand(c("+")), 
+     times = 4))
> 
> genes
## GRanges object with 4 ranges and 0 metadata columns:
##          seqnames    ranges strand
##             <Rle> <IRanges>  <Rle>
##   Gene_1     chr1  [ 7, 15]      +
##   Gene_2     chr1  [17, 23]      +
##   Gene_3     chr2  [ 7, 14]      +
##   Gene_4     chr2  [23, 26]      +
##   -------
##   seqinfo: 2 sequences from an unspecified genome; no seqlengths

Figure 1 is a simple way of plotting ranges stored into Granges object using geom_rect() from ggplot2. More advanced plotting options are available but this simple strategy is sufficient for now and also every excuse is good to use ggplot!

> # Plot ranges with ggplot2
> gr <- as.data.frame(rbind(as.data.frame(gr_S1), as.data.frame(genes)))
> gr$rangeID <- c(names(gr_S1), names(genes))
> gr$Sample <- c(rep("Peaks", length(gr_S1)), rep("Genes", 
+     length(genes)))
> 
> ggplot(data = gr, aes(xmin = start, xmax = end, ymin = 0, 
+     ymax = 1)) + geom_rect(aes(fill = rangeID), alpha = 0.4) + 
+     facet_wrap(~Sample + seqnames) + theme_bw() + labs(x = "Genomic position", 
+     y = "Ranges") + theme(axis.ticks.y = element_blank(), 
+     axis.text.y = element_blank())
Graphic representation of the GRanges objects created above.

Graphic representation of the GRanges objects created above.

Overlaps between two GRanges objects

findOverlaps()

By default looks for overlaps of a minimum of 1bp between a query and a subject.

> ### `findOverlaps()`
> `?`(findOverlaps)
## Help on topic 'findOverlaps' was found in the following packages:
## 
##   Package               Library
##   SummarizedExperiment
##                       /wehisan/general/system/bioinf-software/bioinfsoftware/R/R-3.3.0/lib64/R/library
##   IRanges               /wehisan/general/system/bioinf-software/bioinfsoftware/R/R-3.3.0/lib64/R/library
##   GenomicRanges         /wehisan/home/allstaff/q/quaglieri.a/R/x86_64-pc-linux-gnu-library/3.3
##   GenomicAlignments     /wehisan/general/system/bioinf-software/bioinfsoftware/R/R-3.3.0/lib64/R/library
## 
## 
## Using the first match ...
> overlaps <- findOverlaps(query = gr_S1, subject = genes)
> overlaps
## Hits object with 4 hits and 0 metadata columns:
##       queryHits subjectHits
##       <integer>   <integer>
##   [1]         1           1
##   [2]         2           1
##   [3]         3           2
##   [4]         4           3
##   -------
##   queryLength: 5 / subjectLength: 4
> # Query the output
> queryHits(overlaps)
## [1] 1 2 3 4
> subjectHits(overlaps)
## [1] 1 1 2 3
> subjectLength(overlaps)
## [1] 4
> # You can allow for a gap to be ignored
> overlaps <- findOverlaps(gr_S1, genes, maxgap = 5)
> overlaps
## Hits object with 8 hits and 0 metadata columns:
##       queryHits subjectHits
##       <integer>   <integer>
##   [1]         1           1
##   [2]         2           1
##   [3]         2           2
##   [4]         3           1
##   [5]         3           2
##   [6]         4           3
##   [7]         5           3
##   [8]         5           4
##   -------
##   queryLength: 5 / subjectLength: 4
> # You can specify a minimum overlap
> overlaps <- findOverlaps(gr_S1, genes, minoverlap = 5)
> overlaps
## Hits object with 3 hits and 0 metadata columns:
##       queryHits subjectHits
##       <integer>   <integer>
##   [1]         1           1
##   [2]         2           1
##   [3]         4           3
##   -------
##   queryLength: 5 / subjectLength: 4
> # You can specify a minimum overlap
> overlaps <- findOverlaps(gr_S1, genes, type = "start")
> overlaps
## Hits object with 0 hits and 0 metadata columns:
##    queryHits subjectHits
##    <integer>   <integer>
##   -------
##   queryLength: 5 / subjectLength: 4
> overlaps <- findOverlaps(gr_S1, genes, type = "end")
> overlaps
## Hits object with 1 hit and 0 metadata columns:
##       queryHits subjectHits
##       <integer>   <integer>
##   [1]         2           1
##   -------
##   queryLength: 5 / subjectLength: 4
> overlaps <- findOverlaps(gr_S1, genes, type = "within")
> overlaps
## Hits object with 1 hit and 0 metadata columns:
##       queryHits subjectHits
##       <integer>   <integer>
##   [1]         2           1
##   -------
##   queryLength: 5 / subjectLength: 4

countOverlaps()

> ## `countOverlaps()`
> `?`(countOverlaps)
## Help on topic 'countOverlaps' was found in the following packages:
## 
##   Package               Library
##   IRanges               /wehisan/general/system/bioinf-software/bioinfsoftware/R/R-3.3.0/lib64/R/library
##   GenomicRanges         /wehisan/home/allstaff/q/quaglieri.a/R/x86_64-pc-linux-gnu-library/3.3
## 
## 
## Using the first match ...
> N_overlaps <- countOverlaps(gr_S1, genes)
> N_overlaps
## Range_1 Range_2 Range_3 Range_4 Range_5 
##       1       1       1       1       0
> # You can play around with the same options as
> # findOverlaps()

Nearest-methods in GenomicRanges

nearest()

> ## Nearest-methods in `GenomicRanges` `nearest()`
> `?`(nearest)
## Help on topic 'nearest' was found in the following packages:
## 
##   Package               Library
##   SummarizedExperiment
##                       /wehisan/general/system/bioinf-software/bioinfsoftware/R/R-3.3.0/lib64/R/library
##   IRanges               /wehisan/general/system/bioinf-software/bioinfsoftware/R/R-3.3.0/lib64/R/library
##   GenomicRanges         /wehisan/home/allstaff/q/quaglieri.a/R/x86_64-pc-linux-gnu-library/3.3
## 
## 
## Using the first match ...

It returns a vector of indeces referring to the nearest neighbour range in subject for every range in x. By default if one range overlaps with multiple genes then one overlap will be chosen at random:

> nearest(x = gr_S1, subject = genes)
## [1] 1 1 2 3 4
> nearest(x = gr_S1, subject = genes, select = "all")
## Hits object with 5 hits and 0 metadata columns:
##       queryHits subjectHits
##       <integer>   <integer>
##   [1]         1           1
##   [2]         2           1
##   [3]         3           2
##   [4]         4           3
##   [5]         5           4
##   -------
##   queryLength: 5 / subjectLength: 4
> nearest(gr_S1)
## [1] 2 1 2 5 4
> nearest(gr_S1, gr_S1)
## [1] 1 1 3 4 5

distance()

> ### `distance()`
> `?`(distance)
## Help on topic 'distance' was found in the following packages:
## 
##   Package               Library
##   SummarizedExperiment
##                       /wehisan/general/system/bioinf-software/bioinfsoftware/R/R-3.3.0/lib64/R/library
##   IRanges               /wehisan/general/system/bioinf-software/bioinfsoftware/R/R-3.3.0/lib64/R/library
##   GenomicRanges         /wehisan/home/allstaff/q/quaglieri.a/R/x86_64-pc-linux-gnu-library/3.3
## 
## 
## Using the first match ...
> distance(x = gr_S1[1], y = genes)
## [1]  0  5 NA NA
> distance(x = gr_S1[1:4], y = genes)
## [1]  0  1 NA  6
> distance(x = gr_S1, y = genes)
## [1]  0  1 NA  6 NA

distance() is a symmetric function which means that it requires x and y to have to have the same lenght and if one is shorter than the other one it will be recycled to match the length of the longest. Also, the distance between two consecutive blocks is 0 not 1 which affects the notion of overlaps. If distance(x, y) == 0 then x and y can be either adjacent or overlapping ranges.

distanceToNearest()

> ### `distanceToNearest()`
> `?`(distanceToNearest)
## Help on topic 'distanceToNearest' was found in the following
## packages:
## 
##   Package               Library
##   SummarizedExperiment
##                       /wehisan/general/system/bioinf-software/bioinfsoftware/R/R-3.3.0/lib64/R/library
##   IRanges               /wehisan/general/system/bioinf-software/bioinfsoftware/R/R-3.3.0/lib64/R/library
##   GenomicRanges         /wehisan/home/allstaff/q/quaglieri.a/R/x86_64-pc-linux-gnu-library/3.3
## 
## 
## Using the first match ...

For every range in x it will return the index and the distance to its nearest neighbour in subject.

> distanceToNearest(x = gr_S1, subject = genes)
## Hits object with 5 hits and 1 metadata column:
##       queryHits subjectHits |  distance
##       <integer>   <integer> | <integer>
##   [1]         1           1 |         0
##   [2]         2           1 |         0
##   [3]         3           2 |         0
##   [4]         4           3 |         0
##   [5]         5           4 |         1
##   -------
##   queryLength: 5 / subjectLength: 4

GRangesList

GRangesList are lists of GRanges objects.

> # Create a GRangesList Sample 1
> gr_S1 <- GRanges(seqnames = rep(c("chr1", "chr2"), 
+     times = c(3, 2)), ranges = IRanges(start = c(5, 
+     8, 20, 8, 18), end = c(11, 15, 26, 16, 21), names = c(paste("Region_", 
+     1:5, sep = ""))), strand = rep("*", times = 5))
> 
> gr_S1
## GRanges object with 5 ranges and 0 metadata columns:
##            seqnames    ranges strand
##               <Rle> <IRanges>  <Rle>
##   Region_1     chr1  [ 5, 11]      *
##   Region_2     chr1  [ 8, 15]      *
##   Region_3     chr1  [20, 26]      *
##   Region_4     chr2  [ 8, 16]      *
##   Region_5     chr2  [18, 21]      *
##   -------
##   seqinfo: 2 sequences from an unspecified genome; no seqlengths
> # Sample 2
> gr_S2 <- GRanges(seqnames = rep(c("chr2", "chr3"), 
+     times = c(3, 5)), ranges = IRanges(start = c(1:8), 
+     width = 10, names = c(paste("Region_", 1:8, sep = ""))), 
+     strand = rep("*", times = 8))
> 
> gr_S2
## GRanges object with 8 ranges and 0 metadata columns:
##            seqnames    ranges strand
##               <Rle> <IRanges>  <Rle>
##   Region_1     chr2   [1, 10]      *
##   Region_2     chr2   [2, 11]      *
##   Region_3     chr2   [3, 12]      *
##   Region_4     chr3   [4, 13]      *
##   Region_5     chr3   [5, 14]      *
##   Region_6     chr3   [6, 15]      *
##   Region_7     chr3   [7, 16]      *
##   Region_8     chr3   [8, 17]      *
##   -------
##   seqinfo: 2 sequences from an unspecified genome; no seqlengths
> # GRanges List
> list_ranges <- GRangesList(Sample1 = gr_S1, Sample2 = gr_S2)
> length(list)
## [1] 1

Many of the functions learnt for GRanges can also be applied to GRangesList objects even though the output will have to be interepreted accordingly:

> names(list_ranges)
## [1] "Sample1" "Sample2"
> length(list_ranges)
## [1] 2
> seqnames(list_ranges)
## RleList of length 2
## $Sample1
## factor-Rle of length 5 with 2 runs
##   Lengths:    3    2
##   Values : chr1 chr2
## Levels(3): chr1 chr2 chr3
## 
## $Sample2
## factor-Rle of length 8 with 2 runs
##   Lengths:    3    5
##   Values : chr2 chr3
## Levels(3): chr1 chr2 chr3
> strand(list_ranges)
## RleList of length 2
## $Sample1
## factor-Rle of length 5 with 1 run
##   Lengths: 5
##   Values : *
## Levels(3): + - *
## 
## $Sample2
## factor-Rle of length 8 with 1 run
##   Lengths: 8
##   Values : *
## Levels(3): + - *
> ranges(list_ranges)
## IRangesList of length 2
## $Sample1
## IRanges object with 5 ranges and 0 metadata columns:
##                start       end     width
##            <integer> <integer> <integer>
##   Region_1         5        11         7
##   Region_2         8        15         8
##   Region_3        20        26         7
##   Region_4         8        16         9
##   Region_5        18        21         4
## 
## $Sample2
## IRanges object with 8 ranges and 0 metadata columns:
##                start       end     width
##            <integer> <integer> <integer>
##   Region_1         1        10        10
##   Region_2         2        11        10
##   Region_3         3        12        10
##   Region_4         4        13        10
##   Region_5         5        14        10
##   Region_6         6        15        10
##   Region_7         7        16        10
##   Region_8         8        17        10
> start(list_ranges)
## IntegerList of length 2
## [["Sample1"]] 5 8 20 8 18
## [["Sample2"]] 1 2 3 4 5 6 7 8
> end(list_ranges)
## IntegerList of length 2
## [["Sample1"]] 11 15 26 16 21
## [["Sample2"]] 10 11 12 13 14 15 16 17
> width(list_ranges)
## IntegerList of length 2
## [["Sample1"]] 7 8 7 9 4
## [["Sample2"]] 10 10 10 10 10 10 10 10
> unlist(list_ranges)
## GRanges object with 13 ranges and 0 metadata columns:
##                    seqnames    ranges strand
##                       <Rle> <IRanges>  <Rle>
##   Sample1.Region_1     chr1  [ 5, 11]      *
##   Sample1.Region_2     chr1  [ 8, 15]      *
##   Sample1.Region_3     chr1  [20, 26]      *
##   Sample1.Region_4     chr2  [ 8, 16]      *
##   Sample1.Region_5     chr2  [18, 21]      *
##                ...      ...       ...    ...
##   Sample2.Region_4     chr3   [4, 13]      *
##   Sample2.Region_5     chr3   [5, 14]      *
##   Sample2.Region_6     chr3   [6, 15]      *
##   Sample2.Region_7     chr3   [7, 16]      *
##   Sample2.Region_8     chr3   [8, 17]      *
##   -------
##   seqinfo: 3 sequences from an unspecified genome; no seqlengths

To get the number of ranges in every object of the list use elementNROWS:

> elementNROWS(list_ranges)
## Sample1 Sample2 
##       5       8

Subsetting and looping over Granges list

Add an extra column to every GRanges in the GRangesList.

> addCols <- lapply(list_ranges, function(x) {
+     elementMetadata(x) <- data.frame(NumberReads = rbinom(length(x), 
+         size = 100, prob = 0.5))
+     return(x)
+ })
> class(addCols)
## [1] "list"
> addCols <- GRangesList(addCols)
> class(addCols)
## [1] "GRangesList"
## attr(,"package")
## [1] "GenomicRanges"

In many cases Granges objects can be subsetted using the same rules that apply to normal lists or data frames in R:

> # Subsetting As a list
> addCols[[1]]
## GRanges object with 5 ranges and 1 metadata column:
##            seqnames    ranges strand | NumberReads
##               <Rle> <IRanges>  <Rle> |   <integer>
##   Region_1     chr1  [ 5, 11]      * |          45
##   Region_2     chr1  [ 8, 15]      * |          61
##   Region_3     chr1  [20, 26]      * |          49
##   Region_4     chr2  [ 8, 16]      * |          57
##   Region_5     chr2  [18, 21]      * |          58
##   -------
##   seqinfo: 3 sequences from an unspecified genome; no seqlengths
> addCols["Sample1"]
## GRangesList object of length 1:
## $Sample1 
## GRanges object with 5 ranges and 1 metadata column:
##            seqnames    ranges strand | NumberReads
##               <Rle> <IRanges>  <Rle> |   <integer>
##   Region_1     chr1  [ 5, 11]      * |          45
##   Region_2     chr1  [ 8, 15]      * |          61
##   Region_3     chr1  [20, 26]      * |          49
##   Region_4     chr2  [ 8, 16]      * |          57
##   Region_5     chr2  [18, 21]      * |          58
## 
## -------
## seqinfo: 3 sequences from an unspecified genome; no seqlengths
> # As a data.frame object
> addCols[1, "NumberReads"]
## GRangesList object of length 1:
## $Sample1 
## GRanges object with 5 ranges and 1 metadata column:
##            seqnames    ranges strand | NumberReads
##               <Rle> <IRanges>  <Rle> |   <integer>
##   Region_1     chr1  [ 5, 11]      * |          45
##   Region_2     chr1  [ 8, 15]      * |          61
##   Region_3     chr1  [20, 26]      * |          49
##   Region_4     chr2  [ 8, 16]      * |          57
##   Region_5     chr2  [18, 21]      * |          58
## 
## -------
## seqinfo: 3 sequences from an unspecified genome; no seqlengths
> addCols["Sample1", "NumberReads"]
## GRangesList object of length 1:
## $Sample1 
## GRanges object with 5 ranges and 1 metadata column:
##            seqnames    ranges strand | NumberReads
##               <Rle> <IRanges>  <Rle> |   <integer>
##   Region_1     chr1  [ 5, 11]      * |          45
##   Region_2     chr1  [ 8, 15]      * |          61
##   Region_3     chr1  [20, 26]      * |          49
##   Region_4     chr2  [ 8, 16]      * |          57
##   Region_5     chr2  [18, 21]      * |          58
## 
## -------
## seqinfo: 3 sequences from an unspecified genome; no seqlengths
> # Looping
> lapply(addCols, length)
## $Sample1
## [1] 5
## 
## $Sample2
## [1] 8
> sapply(addCols, length)
## Sample1 Sample2 
##       5       8

Similar considerations apply for the other functions explored above like findOverlaps(), countOverlaps(), etc…

Challenge 2

Use the code below to download the cpg_islands object containing the CpG islands for the Human chrY and the txdb gene annotation from TxDb.Hsapiens.UCSC.hg19.knownGene.

> # Download CpG islands from UCSC genome browser in
> # the same way you can extract many more genomic
> # object
> session <- browserSession("UCSC")
> query <- ucscTableQuery(session, "CpG Islands", GRangesForUCSCGenome("hg19", 
+     "chrY"))
> cpg_islands <- getTable(query)
> # Download genes
> txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
  1. What is the class of cpg_islands? Convert cpg_islands to a GRanges object.
  2. Using what you have previosly learnt, get the transcript for every gene from the txdb gene annotation and store them into the transcriptHg19 object (suggestion: see transcriptsBy).
  3. Subset transcriptHg19 to extract only the transcript on chr21 (suggestion: first run unlist(transcriptHg19)).
  4. Find the nearest CpG island in cpg_islandsGR to every transcript in transcriptHg19 as well as their distance.
  5. Optional: On average, every CpG island is close to how many genes?

Rtracklayer

Now that the structure and manipulation of GRanges objects is clear, the rtracklayer package comes as a useful tool to import/export GRanges objects from and to different data formats commonly used in genomic analyses. Rtracklayer stands as an interface to mediate the crosstalk between R and genome browsers (UCSC built-in). Example of data files supported by this package are BED, bigWIG, wig, gff and gtf formats.

Another very important function of this package is to allow the visualisations of genomic annotation tracks. However, this part will not covered in the present tutorial. For more references see the Rtracklayer Bioconductor page.

There is a very large number of different data formats used to store different sort of genomic data. In this tutororial only few of them will be covered as example. However, the UCSC genome growser offers a useful FAQ page where one can find the specification of all mandatory and optional fields for every data format.

GTF and GFF

The GFF and GTF are the preferred format used to store annotations and their exact specification is summarised in GFF/GTF File Format.

The function import() is used to import all the supported data types into a GRanges object in R. The function recognises the format from the extension of the file but the argument format can be used to expliciclty define it. The function export() works in the same way and it is used to export GRanges objects to files.

> # Rtracklayer Import/Export with Rtrcaklayer
> gr_S1
## GRanges object with 5 ranges and 0 metadata columns:
##            seqnames    ranges strand
##               <Rle> <IRanges>  <Rle>
##   Region_1     chr1  [ 5, 11]      *
##   Region_2     chr1  [ 8, 15]      *
##   Region_3     chr1  [20, 26]      *
##   Region_4     chr2  [ 8, 16]      *
##   Region_5     chr2  [18, 21]      *
##   -------
##   seqinfo: 2 sequences from an unspecified genome; no seqlengths
> export(gr_S1, file.path(dir, "transcriptHg19.gff"))
> export(gr_S1, file.path(dir, "transcriptHg19.gtf"))
> 
> importHg19 <- import(file.path(dir, "transcriptHg19.gff"))
> class(importHg19)
## [1] "GRanges"
## attr(,"package")
## [1] "GenomicRanges"
> importHg19
## GRanges object with 5 ranges and 5 metadata columns:
##       seqnames    ranges strand |      source             type     score
##          <Rle> <IRanges>  <Rle> |    <factor>         <factor> <numeric>
##   [1]     chr1  [ 5, 11]      * | rtracklayer sequence_feature      <NA>
##   [2]     chr1  [ 8, 15]      * | rtracklayer sequence_feature      <NA>
##   [3]     chr1  [20, 26]      * | rtracklayer sequence_feature      <NA>
##   [4]     chr2  [ 8, 16]      * | rtracklayer sequence_feature      <NA>
##   [5]     chr2  [18, 21]      * | rtracklayer sequence_feature      <NA>
##           phase    group
##       <integer> <factor>
##   [1]      <NA>     chr1
##   [2]      <NA>     chr1
##   [3]      <NA>     chr1
##   [4]      <NA>     chr2
##   [5]      <NA>     chr2
##   -------
##   seqinfo: 2 sequences from an unspecified genome; no seqlengths

Challenge 1

  1. Create a simple GRanges object made out of 20 ranges and export it as a gff file.
  2. Read the file back into R.
  3. Optional: Print the number of ranges of the GRanges object just imported and create a histogram of the widths of the ranges.

Wiggle (WIG) and bigWIG file formats for graphing tracks

The WIG data format is used for the disply of dense continuous data, like scores. Every line in the file is an intervals and WIG files only allow equally spaced intervals. For intervals of variable width one should use the bedGraph format. There are two main data formats: the variableStep and the fixedStep. The WIG file is constituted by one or more blocks separated by declaration lines. Let’s go through them with two examples.

  • variableStep

The following example is a WIG file with only one block of coordinates. The field chrom is required.

variableStep chrom=chr2
300701  12.5
300702  12.5
300703  12.5
300704  12.5
300705  12.5

The same file can be defined as follows using the span argument.

variableStep chrom=chr2 span=5
300701  12.5
  • fixedStep

This format allows a more compact way of storing intervals. In this situation the span would not change the dimension of the file.

fixedStep chrom=chr3 start=400601 step=100
11
22
33

The bigWig file is created from a WIG file and in R this can be achieved through wigToBigWig(). bigWig is the recommended format for very large data tracks due to the way in which data are compressed and stored in an indexed binary format. Usually whole-genome coverage vectors are stored as bigWig. Howver, the loss is negligible when dealing with very large amount of data.

> ## Wiggle (WIG) and bigWIG Import
> wig_path <- file.path(dir, "exampleWIG.wig")
> import_wig <- import(con = wig_path, seqinfo = Seqinfo(genome = "mm10"))
> import_wig
## GRanges object with 16 ranges and 1 metadata column:
##        seqnames           ranges strand |     score
##           <Rle>        <IRanges>  <Rle> | <numeric>
##    [1]     chr2 [300701, 300701]      * |      12.5
##    [2]     chr2 [300702, 300702]      * |      12.5
##    [3]     chr2 [300703, 300703]      * |      12.5
##    [4]     chr2 [300704, 300704]      * |      12.5
##    [5]     chr2 [300705, 300705]      * |      12.5
##    ...      ...              ...    ... .       ...
##   [12]     chr3 [401501, 401501]      * |      10.5
##   [13]     chr4 [500001, 500001]      * |        10
##   [14]     chr4 [500201, 500201]      * |        20
##   [15]     chr4 [500401, 500401]      * |        20
##   [16]     chr4 [500601, 500601]      * |        20
##   -------
##   seqinfo: 66 sequences (1 circular) from mm10 genome
> `?`(Seqinfo)
> seqinfo(import_wig)
## Seqinfo object with 66 sequences (1 circular) from mm10 genome:
##   seqnames       seqlengths isCircular genome
##   chr1            195471971      FALSE   mm10
##   chr2            182113224      FALSE   mm10
##   chr3            160039680      FALSE   mm10
##   chr4            156508116      FALSE   mm10
##   chr5            151834684      FALSE   mm10
##   ...                   ...        ...    ...
##   chrUn_GL456392      23629      FALSE   mm10
##   chrUn_GL456393      55711      FALSE   mm10
##   chrUn_GL456394      24323      FALSE   mm10
##   chrUn_GL456396      21240      FALSE   mm10
##   chrUn_JH584304     114452      FALSE   mm10

Convert the WIG file into a bigWig file and import the bigWig file into R.

> bigwig_path <- file.path(dir, "example_wigToBigWig.bw")
> wigToBigWig(wig_path, dest = bigwig_path, seqinfo = seqinfo(import_wig))
> import_bigwig <- import(con = bigwig_path)

Import only a region of the bigWig file

The import functions allows to load into a R only a specific regions defined using a GRanges object.

> # Define a region
> which_region <- GRanges(seqnames = "chr3", IRanges(start = 1, 
+     end = 5e+05))
> import_bigwig_region <- import(con = bigwig_path, which = which_region)

liftOver from different genome releases

This function allows to convert genomic coordinate between difference releases of the same genome or even between different species.

> `?`(AnnotationHub)
> ahub <- AnnotationHub()
> ahub
## AnnotationHub with 37202 records
## # snapshotDate(): 2016-10-11 
## # $dataprovider: BroadInstitute, UCSC, Ensembl, ftp://ftp.ncbi.nlm.nih....
## # $species: Homo sapiens, Mus musculus, Bos taurus, Pan troglodytes, Da...
## # $rdataclass: GRanges, BigWigFile, FaFile, TwoBitFile, ChainFile, OrgD...
## # additional mcols(): taxonomyid, genome, description,
## #   coordinate_1_based, maintainer, rdatadateadded, preparerclass,
## #   tags, sourceurl, sourcetype 
## # retrieve records with, e.g., 'object[["AH2"]]' 
## 
##             title                                                   
##   AH2     | Ailuropoda_melanoleuca.ailMel1.69.dna.toplevel.fa       
##   AH3     | Ailuropoda_melanoleuca.ailMel1.69.dna_rm.toplevel.fa    
##   AH4     | Ailuropoda_melanoleuca.ailMel1.69.dna_sm.toplevel.fa    
##   AH5     | Ailuropoda_melanoleuca.ailMel1.69.ncrna.fa              
##   AH6     | Ailuropoda_melanoleuca.ailMel1.69.pep.all.fa            
##   ...       ...                                                     
##   AH51457 | Xiphophorus_maculatus.Xipmac4.4.2.dna_rm.toplevel.2bit  
##   AH51458 | Xiphophorus_maculatus.Xipmac4.4.2.dna_sm.toplevel.2bit  
##   AH51459 | Xiphophorus_maculatus.Xipmac4.4.2.dna.toplevel.2bit     
##   AH51460 | Xiphophorus_maculatus.Xipmac4.4.2.ncrna.2bit            
##   AH51461 | Alternative Splicing Annotation for Homo sapiens (Human)
> # Chain files are data format used to convert
> # coordinates between genome references
> ahub.chain <- subset(ahub, rdataclass == "ChainFile" & 
+     species == "Homo sapiens")
> ahub.chain2 <- subset(ahub, rdataclass == "ChainFile" & 
+     species %in% c("Homo sapiens", "Mus musculus"))
> query(ahub.chain, c("hg18", "hg19"))
## AnnotationHub with 2 records
## # snapshotDate(): 2016-10-11 
## # $dataprovider: UCSC
## # $species: Homo sapiens
## # $rdataclass: ChainFile
## # additional mcols(): taxonomyid, genome, description,
## #   coordinate_1_based, maintainer, rdatadateadded, preparerclass,
## #   tags, sourceurl, sourcetype 
## # retrieve records with, e.g., 'object[["AH14149"]]' 
## 
##             title                   
##   AH14149 | hg19ToHg18.over.chain.gz
##   AH14220 | hg18ToHg19.over.chain.gz
> chain <- ahub.chain[ahub.chain$title == "hg19ToHg18.over.chain.gz"]
> chain <- chain[[1]]
> gr.hg18 <- liftOver(import_bigwig, chain)
> gr.hg18
## GRangesList object of length 16:
## [[1]] 
## GRanges object with 1 range and 1 metadata column:
##       seqnames           ranges strand |     score
##          <Rle>        <IRanges>  <Rle> | <numeric>
##   [1]     chr2 [290701, 290701]      * |      12.5
## 
## [[2]] 
## GRanges object with 1 range and 1 metadata column:
##       seqnames           ranges strand | score
##   [1]     chr2 [290702, 290702]      * |  12.5
## 
## [[3]] 
## GRanges object with 1 range and 1 metadata column:
##       seqnames           ranges strand | score
##   [1]     chr2 [290703, 290703]      * |  12.5
## 
## ...
## <13 more elements>
## -------
## seqinfo: 3 sequences from an unspecified genome; no seqlengths
> unlist(gr.hg18)
## GRanges object with 16 ranges and 1 metadata column:
##        seqnames           ranges strand |     score
##           <Rle>        <IRanges>  <Rle> | <numeric>
##    [1]     chr2 [290701, 290701]      * |      12.5
##    [2]     chr2 [290702, 290702]      * |      12.5
##    [3]     chr2 [290703, 290703]      * |      12.5
##    [4]     chr2 [290704, 290704]      * |      12.5
##    [5]     chr2 [290705, 290705]      * |      12.5
##    ...      ...              ...    ... .       ...
##   [12]     chr3 [376501, 376650]      * |      10.5
##   [13]     chr4 [490001, 490200]      * |        10
##   [14]     chr4 [490201, 490400]      * |        20
##   [15]     chr4 [490401, 490600]      * |        20
##   [16]     chr4 [490601, 490800]      * |        20
##   -------
##   seqinfo: 3 sequences from an unspecified genome; no seqlengths

Rsamtools

Rsamtools is an R interface for Samtools which offers a large variety of utilities to manipulate SAM and BAM files. The main purose of Rsamtools is to import BAM files into R. This will allow the user to then create objects which can be used for several types of downstream analyses. BAM files are the standard way of storing ‘short’ aligned (and unaligned) reads produced by any type of experiment (RNA-Seq, ChIP-Seq, Methylation data, etc). BAM files are binary versions of SAM files and can be indexed allowing to access only localised regions of the genome, similar to what we have seen for the bigWig file. Each read is stored in a BAM file with several other measures like base quality, position of the 5’ end of the read, read name, etc …. Rsamtools allows the user to specify which parameters of the reads to load into R. A detailed description of the fields in the BAM and their names is available at http://samtools.github.io/hts-specs/SAMv1.pdf. It has to be remembered that BAM files can be very large and there are limits in how much can be done into R and for more complex analyses is preferred to work with Samtools is prefered.

> 
+ D00626:239:CAFMCANXX:4:1201:16222:93271 403  chr1    17018   3  38M177N62M      =       16965   -330 TGGCCCAGGTCTGGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCT  GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
+ RG:Z:9_CAFMCANXX_L004   NH:i:2  HI:i:2  NM:i:0  nM:i:1  AS:i:200

A very important field which is worth understanding is the FLAG of the read. This is a binary number which defines the type of read. In the example above it is the number 403. For example, the FLAG defines whether the read is unmapped or mapped or with paired end (PE) reads if the first mate is mapped and the second one is not, and so on. It is not always trivial to decode a FLAG number and the Broad Institute offers a useful online tool to do that for you https://broadinstitute.github.io/picard/explain-flags.html.

Input BAM into R

The function used to load BAM files into R is scanBam(). The function ScanBamParam() is used to specify the fields and the region/s of the BAM file to load into R. The genomic region to load is defined through GRanges. Below is an example on how to load a few regions of 1000 bp width from a BAM file containing reads on chromosome 21 from an RNA-Seq experiment. We also require only uniquely mapped and properly paired reads to be loaded as well as we do not load PCR duplicates. The help page of ?scanBam lists the name of the fields that can be accessed through the what argument in the ScanBamParam() function.

> # Path to the bamfile. In the same folder there has
> # to be the chr1.bam.bai file
> bamfile <- file.path(dir, "chr21.bam")
> # Define parameters
> which <- GRanges(seqnames = rep("chr21", 3), IRanges(start = c(9827000, 
+     16267000, 15890000), width = 10000))
> what <- c("rname", "strand", "pos", "qwidth", "seq", 
+     "isize")
> # ?scanBamFlag
> flag = scanBamFlag(isDuplicate = FALSE, isProperPair = TRUE, 
+     isPaired = TRUE)
> 
> param <- ScanBamParam(which = which, what = what, flag = flag)
> # load reads into R
> reads = scanBam(bamfile, param = param)
> 
> # Explore the reads
> names(reads)
## [1] "chr21:9827000-9836999"   "chr21:16267000-16276999"
## [3] "chr21:15890000-15899999"
> region1 <- reads$`chr21:9827000-9836999`
> names(region1)
## [1] "rname"  "strand" "pos"    "qwidth" "isize"  "seq"
> head(region1$rname)
## [1] chr21 chr21 chr21 chr21 chr21 chr21
## 25 Levels: chrM chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 ... chrY
> head(region1$pos)
## [1] 9826933 9826933 9826934 9826934 9826934 9826940
> head(region1$qwidth)
## [1] 100 100 100 100 100 100
> head(region1$seq)
##   A DNAStringSet instance of length 6
##     width seq
## [1]   100 CCGGGCCCGTCCTCGCGAGGCCCCCCGGCCG...TACCTACCTGGTTGATCCTGCCACTAGCATA
## [2]   100 CCGGGCCCGTCCTCGCGAGGCCCCCCGGCCG...TAACTACCTGGTTGATTCTGCCAGTAGCATA
## [3]   100 CGGGCCCGTCCTCGCGAGGCCCCCCGGCCGG...ACCTACCTGGTTGATCCTGCCAGTAGCATAT
## [4]   100 CGGGCCCGTCCTCGCGAGGCCCCCCGGCCGG...ACCTACCTGGTTGATCCTGCCAGTAGCATAT
## [5]   100 CGGGCCCGTCCTCGCGAGGCCCCCCGGCCGG...ACCTACCTGGTTGATCCTGCCAGTAGCATAT
## [6]   100 CGTCCTCGCGAGGCCCCCCGGCCGGCCGTCC...CTGGTTGATCCTGCCAGTAGCATTTGCTTGT
> head(region1$isize)
## [1] 146 144 162 162 145 158

readGAlignments also loads BAM files into R directly into a GRanges object. The parameters are specified as in RSamtools with the argument param.

> # GenomicAlignments
> readsGA <- readGAlignments(bamfile, param = param)

Challenge 3

  1. Define a new ScanBamParam() object that satisfies the following options: use the same ranges as above, load both main alignments and PCR duplicates, properly paired and uniquely mapped reads.
  2. Plot an histogram of the fragment sizes of every read (Suggestion: look for the isize fields)

Compute number of reads that falls within 100bp bins

It is often useful to summarise the number of reads falling within a bin to have an idea about the distribution of the reads across a region. There are several ways in which this can be accomplished and here two will be shown. Often, a read is counted in a bin if its 5’ end falls within bin. This is the approach that we will consider.

> ## Compute number of reads that falls within 100bp
> ## bins
> bins <- GRanges(seqnames = rep("chr21", 11), IRanges(start = seq(9827000, 
+     9827000 + 10000, by = 1000), width = 1000))
> # Create vector of 0-1 positions
> positions <- region1$pos
> vector_positions <- rep(0, max(positions))
> vector_positions[positions] <- 1
> # Views Object
> rangeViews <- Views(vector_positions, start = seq(min(positions), 
+     max(positions), length.out = 10), width = 60)
> # viewSums
> bin_sum <- viewSums(rangeViews)
> # Add extra column to the ranges object
> binned_counts <- ranges(rangeViews)
> values(binned_counts) <- data.frame(ReadCounts = bin_sum)