- Bioconductor: http://bioconductor.org/
- 900+ packages for biostatistics
- Developed by top biostatistics researchers
- Extensive documentation: emphasizes reproducible research
November 14, 2014
To install or upgrade bioconductor run:
source("http://bioconductor.org/biocLite.R") biocLite()
you only need to run these commands once, not every session.
For this workshop we'll need several packages, not all are included in the basic bioconductor installation:
biocLite("limma") biocLite( "pathview") biocLite(c("breastCancerMAINZ", "hgu133a.db"))
In each R session where you will work with bioconductor, you need to load the packages. I'll load in some publicly available data on breast cancer.
library(breastCancerMAINZ) data(mainz) library(Biobase)
## Loading required package: BiocGenerics ## Loading required package: parallel ## ## Attaching package: 'BiocGenerics' ## ## The following objects are masked from 'package:parallel': ## ## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ, ## clusterExport, clusterMap, parApply, parCapply, parLapply, ## parLapplyLB, parRapply, parSapply, parSapplyLB ## ## The following object is masked from 'package:stats': ## ## xtabs ## ## The following objects are masked from 'package:base': ## ## anyDuplicated, append, as.data.frame, as.vector, cbind, ## colnames, do.call, duplicated, eval, evalq, Filter, Find, get, ## intersect, is.unsorted, lapply, Map, mapply, match, mget, ## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank, ## rbind, Reduce, rep.int, rownames, sapply, setdiff, sort, ## table, tapply, union, unique, unlist, unsplit ## ## Welcome to Bioconductor ## ## Vignettes contain introductory material; view with ## 'browseVignettes()'. To cite Bioconductor, see ## 'citation("Biobase")', and for packages 'citation("pkgname")'.
mainz
DataThe mainz
experiment is described in http://cancerres.aacrjournals.org/content/68/13/5405.abstract
dim(mainz)
## Features Samples ## 22283 200
it contains 200 arrays and measures 22,000+ genes.
Each array is labeled with phenotype data, in this case it includes many variables:
colnames(pData(mainz))
## [1] "samplename" "dataset" "series" "id" ## [5] "filename" "size" "age" "er" ## [9] "grade" "pgr" "her2" "brca.mutation" ## [13] "e.dmfs" "t.dmfs" "node" "t.rfs" ## [17] "e.rfs" "treatment" "tissue" "t.os" ## [21] "e.os"
Here is the expression of one gene in all 200 arrays.
row = 21399 plot(exprs(mainz)[row,])
Let's build a factor to separate the patients that developed distant metastases within 5 years.
mainz.class <- (pData(mainz)$e.dmfs & pData(mainz)$t.dmfs < 5*365) mainz.fac <- factor(mainz.class, labels= c("CLEAR","DM")) sum(mainz.class)
## [1] 28
stripchart(exprs(mainz)[row,] ~ mainz.fac, method="jitter")
limma
Packagelibrary(limma)
## ## Attaching package: 'limma' ## ## The following object is masked from 'package:BiocGenerics': ## ## plotMA
?limma
Each array has 22,000+ genes.
plotMA(mainz)
Instead of running 22,000+ t-tests, we can fit a linear model. The limma
User's Guide has many more examples.
design <- model.matrix(~0 + mainz.fac) colnames(design) <- c("CLEAR", "DM") fit <- lmFit(exprs(mainz), design) cont.matrix <- makeContrasts(DMvsCLEAR = DM - CLEAR, levels = design) fit2 <- contrasts.fit(fit, cont.matrix) fit.b <- eBayes(fit2)
The plot of the fit object shows the size of the predicted effect on each gene.
plotMA(fit.b)
The volcanoplot shows the log odds of differential expression vs the log ratio of expression.
volcanoplot(fit.b)
The topTable lists the probe sets ranked by the strength of evidence for differential expression.
topTable(fit.b, adjust = "BH")
## logFC AveExpr t P.Value adj.P.Val ## 222039_at 1.0241907 8.094413 5.387081 1.988696e-07 0.002835900 ## 218009_s_at 0.9425805 9.172245 5.300403 3.023342e-07 0.002835900 ## 57703_at 0.4721307 7.167348 5.215717 4.531487e-07 0.002835900 ## 221760_at -0.9685860 9.995876 -5.149831 6.188802e-07 0.002835900 ## 200830_at 0.3632375 11.382618 5.143923 6.363370e-07 0.002835900 ## 204156_at -0.8234357 7.522004 -5.072697 8.882609e-07 0.003298853 ## 202412_s_at 0.5866401 8.633824 4.984981 1.333395e-06 0.003808274 ## 204825_at 1.1068133 8.883956 4.974329 1.400342e-06 0.003808274 ## 213007_at 0.6347787 8.397143 4.931885 1.700912e-06 0.003808274 ## 203799_at -0.6801316 10.016417 -4.907973 1.896838e-06 0.003808274 ## B ## 222039_at 6.737055 ## 218009_s_at 6.358004 ## 57703_at 5.992056 ## 221760_at 5.710387 ## 200830_at 5.685258 ## 204156_at 5.384055 ## 202412_s_at 5.017503 ## 204825_at 4.973321 ## 213007_at 4.797999 ## 203799_at 4.699734
We can use bioconductor annotation packages to find information for the probes in our dataset.
library(hgu133a.db)
## Loading required package: AnnotationDbi ## Loading required package: stats4 ## Loading required package: GenomeInfoDb ## Loading required package: S4Vectors ## Loading required package: IRanges ## ## Attaching package: 'AnnotationDbi' ## ## The following object is masked from 'package:GenomeInfoDb': ## ## species ## ## Loading required package: org.Hs.eg.db ## Loading required package: DBI
hgu133a.db
ls("package:hgu133a.db")
## [1] "hgu133a" "hgu133a_dbconn" "hgu133a_dbfile" ## [4] "hgu133a_dbInfo" "hgu133a_dbschema" "hgu133a.db" ## [7] "hgu133aACCNUM" "hgu133aALIAS2PROBE" "hgu133aCHR" ## [10] "hgu133aCHRLENGTHS" "hgu133aCHRLOC" "hgu133aCHRLOCEND" ## [13] "hgu133aENSEMBL" "hgu133aENSEMBL2PROBE" "hgu133aENTREZID" ## [16] "hgu133aENZYME" "hgu133aENZYME2PROBE" "hgu133aGENENAME" ## [19] "hgu133aGO" "hgu133aGO2ALLPROBES" "hgu133aGO2PROBE" ## [22] "hgu133aMAP" "hgu133aMAPCOUNTS" "hgu133aOMIM" ## [25] "hgu133aORGANISM" "hgu133aORGPKG" "hgu133aPATH" ## [28] "hgu133aPATH2PROBE" "hgu133aPFAM" "hgu133aPMID" ## [31] "hgu133aPMID2PROBE" "hgu133aPROSITE" "hgu133aREFSEQ" ## [34] "hgu133aSYMBOL" "hgu133aUNIGENE" "hgu133aUNIPROT"
x <- hgu133aPATH2PROBE # Get the probe identifiers that are mapped to a KEGG pathway mapped_probes <- mappedkeys(x) # Convert to a list xx <- as.list(x[mapped_probes]) # build a list for mroast indices <- ids2indices(xx, rownames(mainz))
mroast
Run mroast
to find pathways with differential expression.
res <- mroast(mainz, indices, design) head(res)
## NGenes PropDown PropUp Direction PValue FDR PValue.Mixed FDR.Mixed ## 01100 1457 0 1 Up 0.001 5e-04 0.001 5e-04 ## 05200 619 0 1 Up 0.001 5e-04 0.001 5e-04 ## 04010 475 0 1 Up 0.001 5e-04 0.001 5e-04 ## 04510 402 0 1 Up 0.001 5e-04 0.001 5e-04 ## 04080 380 0 1 Up 0.001 5e-04 0.001 5e-04 ## 04810 379 0 1 Up 0.001 5e-04 0.001 5e-04
To see the pathways we need a data matrix with ENTREZ Gene ID as the rowname
x <- hgu133aENTREZID # Convert to a list xx <- as.list(x) entrezid <- sapply(rownames(fit.b), function(x) xx[x], USE.NAMES=FALSE)
pathview
Packagelibrary(pathview)
## Loading required package: KEGGgraph ## Loading required package: XML ## Loading required package: graph ## ## Attaching package: 'graph' ## ## The following object is masked from 'package:XML': ## ## addNode ## ## ############################################################################## ## Pathview is an open source software package distributed under GNU General ## Public License version 3 (GPLv3). Details of GPLv3 is available at ## http://www.gnu.org/licenses/gpl-3.0.html. ## ## The pathview downloads and uses KEGG data. Academic users may freely use the ## KEGG website at http://www.kegg.jp/ or its mirror site at GenomeNet ## http://www.genome.jp/kegg/. Academic users may also freely link to the KEGG ## website. Non-academic users may use the KEGG website as end users for ## non-commercial purposes, but any other use requires a license agreement ## (details at http://www.kegg.jp/kegg/legal.html). ## ##############################################################################
gene.data <- fit.b$coefficients rownames(gene.data) <- entrezid
pv.out <- pathview(gene.data = gene.data, pathway.id = "04010", species = "hsa", out.suffix = "dmvsclear", kegg.native = T, same.layer = F)
## Info: Working in directory /Users/humberto/Documents/class/2014/expression-workshop ## Info: Writing image file hsa04010.dmvsclear.png