12  Bioconductor

12.1 Introduction

Bioconductor is a collection of R packages specifically designed for the analysis of biological data. It is an open source, i.e. you have access to the source code of all the packages, and open development, i.e. you are welcome to contribute new packages, repository. It’s name reflect the it’s purpose which is to facilitate the use of a varied collection of packages geared towards biological research. Just like a Conductor in an Orchestra, the Bioconductor brings together individual specialized packages and symphonize their application. To install Bioconductor, run the following code on the R console.

if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install()

Once the BiocManager is installed, packages within the bioconductor repository can be installed by running BiocManager::install(<package name>) e.g. to install BiocPkgTools (which has information about different packages in the bioconductor repo) run BiocManager::install("BiocPkgTools").

BiocManager::install("Biostrings")
BiocManager::install("BiocPkgTools")
BiocManager::install("seqinr")

List of packages in the Bioconductor repository.

library(BiocPkgTools)
Loading required package: htmlwidgets
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
p1 <- biocDownloadStats("software")
p1 %>%
  group_by(Package) %>%
  slice(which.max(Nb_of_downloads))%>%
  arrange(desc(Nb_of_downloads))
# A tibble: 4,968 × 7
# Groups:   Package [4,968]
   pkgType  Package     Year Month Nb_of_distinct_IPs Nb_of_downloads Date      
   <chr>    <chr>      <int> <chr>              <int>           <int> <date>    
 1 software BiocVersi…  2022 Mar                55093          406419 2022-03-01
 2 software flowCore    2017 May                 1155          390693 2017-05-01
 3 software GEOquery    2022 Oct                 8664          289449 2022-10-01
 4 software MeSHDbi     2022 Oct                  145          277080 2022-10-01
 5 software reticulate  2022 Oct                    5          276858 2022-10-01
 6 software minfi       2016 May                 1920          240775 2016-05-01
 7 software GenomeInf…  2022 May                49133          181946 2022-05-01
 8 software S4Vectors   2022 Mar                57317          146957 2022-03-01
 9 software DelayedAr…  2023 Mar                43541          144041 2023-03-01
10 software BiocGener…  2022 May                55640          140879 2022-05-01
# ℹ 4,958 more rows

12.2 Working with sequence data

library(seqinr)

Attaching package: 'seqinr'
The following object is masked from 'package:dplyr':

    count
print(aaa())
 [1] "Stp" "Ala" "Cys" "Asp" "Glu" "Phe" "Gly" "His" "Ile" "Lys" "Leu" "Met"
[13] "Asn" "Pro" "Gln" "Arg" "Ser" "Thr" "Val" "Trp" "Tyr"
print(a(aaa()))
 [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
[20] "W" "Y"
print(a("Val"))
[1] "V"
INS_gene <- read.fasta("INS_gene.fasta")
length(INS_gene)
[1] 2
getAnnot(INS_gene)  
[[1]]
[1] ">NC_000011.10:c2161209-2159779 INS [organism=Homo sapiens] [GeneID=3630] [chromosome=11]"

[[2]]
[1] ">NC_060935.1:c2248857-2247427 INS [organism=Homo sapiens] [GeneID=3630] [chromosome=11]"
for(x in INS_gene){
  print(GC(x))
}
[1] 0.6464011
[1] 0.6470999

To computationally translate a gene sequence use getTrans function.

paste(getTrans(INS_gene[1])[[1]],collapse = "")
[1] "SPPGQAASEEAIKQVCSKGLCVRWAQDSRVAGPQAPALQQGGRGWAREACGGEPRGPKAGHLAFSLPQPCLSPRSLSFCHGPVDAPPAPAGAAGPLGT*PSRSLCEPTPVRLTPGGSSLPSVRGTRLLLHTQDPPGGRGPAG*ANCPLLPLAAPSHPLLLALPPSMGRRGQEAATQQGVRCTFLKRSSLGHVLKVTSSLWPSQNLSLRTVLASAAPRYIRGWARSSLHSPLKQMPRSPFLHPHLMTADSSVLLSKVLGDLGSQGAPRCLPLGEHPITPGGGRGCLPEWARPLSPGLTAAP*SGDGEDAGDRPWGEVLGSPVQAPTVTLPRGGGRRWDMWALGPVGPHPVWVTLPLTWVQPGWRWVGVRPRAGGQAGTVSP*LCPPVSLCLAAVPEPALRGTSWQWGRWSWAGALVQAACSPWPWRGPCRSVALWNNAVPASAPSTSWRTTATRRSPQAAPHPPPPAPREME*SP*TS"

To get the full sequence as a string instead of a vector use as.string function.

INS_gene2 <- read.fasta("INS_gene.fasta", as.string = T,  forceDNAtolower = FALSE)
INS_gene2$`NC_000011.10:c2161209-2159779`
[1] "AGCCCTCCAGGACAGGCTGCATCAGAAGAGGCCATCAAGCAGGTCTGTTCCAAGGGCCTTTGCGTCAGGTGGGCTCAGGATTCCAGGGTGGCTGGACCCCAGGCCCCAGCTCTGCAGCAGGGAGGACGTGGCTGGGCTCGTGAAGCATGTGGGGGTGAGCCCAGGGGCCCCAAGGCAGGGCACCTGGCCTTCAGCCTGCCTCAGCCCTGCCTGTCTCCCAGATCACTGTCCTTCTGCCATGGCCCTGTGGATGCGCCTCCTGCCCCTGCTGGCGCTGCTGGCCCTCTGGGGACCTGACCCAGCCGCAGCCTTTGTGAACCAACACCTGTGCGGCTCACACCTGGTGGAAGCTCTCTACCTAGTGTGCGGGGAACGAGGCTTCTTCTACACACCCAAGACCCGCCGGGAGGCAGAGGACCTGCAGGGTGAGCCAACTGCCCATTGCTGCCCCTGGCCGCCCCCAGCCACCCCCTGCTCCTGGCGCTCCCACCCAGCATGGGCAGAAGGGGGCAGGAGGCTGCCACCCAGCAGGGGGTCAGGTGCACTTTTTTAAAAAGAAGTTCTCTTGGTCACGTCCTAAAAGTGACCAGCTCCCTGTGGCCCAGTCAGAATCTCAGCCTGAGGACGGTGTTGGCTTCGGCAGCCCCGAGATACATCAGAGGGTGGGCACGCTCCTCCCTCCACTCGCCCCTCAAACAAATGCCCCGCAGCCCATTTCTCCACCCTCATTTGATGACCGCAGATTCAAGTGTTTTGTTAAGTAAAGTCCTGGGTGACCTGGGGTCACAGGGTGCCCCACGCTGCCTGCCTCTGGGCGAACACCCCATCACGCCCGGAGGAGGGCGTGGCTGCCTGCCTGAGTGGGCCAGACCCCTGTCGCCAGGCCTCACGGCAGCTCCATAGTCAGGAGATGGGGAAGATGCTGGGGACAGGCCCTGGGGAGAAGTACTGGGATCACCTGTTCAGGCTCCCACTGTGACGCTGCCCCGGGGCGGGGGAAGGAGGTGGGACATGTGGGCGTTGGGGCCTGTAGGTCCACACCCAGTGTGGGTGACCCTCCCTCTAACCTGGGTCCAGCCCGGCTGGAGATGGGTGGGAGTGCGACCTAGGGCTGGCGGGCAGGCGGGCACTGTGTCTCCCTGACTGTGTCCTCCTGTGTCCCTCTGCCTCGCCGCTGTTCCGGAACCTGCTCTGCGCGGCACGTCCTGGCAGTGGGGCAGGTGGAGCTGGGCGGGGGCCCTGGTGCAGGCAGCCTGCAGCCCTTGGCCCTGGAGGGGTCCCTGCAGAAGCGTGGCATTGTGGAACAATGCTGTACCAGCATCTGCTCCCTCTACCAGCTGGAGAACTACTGCAACTAGACGCAGCCCGCAGGCAGCCCCACACCCGCCGCCTCCTGCACCGAGAGAGATGGAATAAAGCCCTTGAACCAGC"
attr(,"name")
[1] "NC_000011.10:c2161209-2159779"
attr(,"Annot")
[1] ">NC_000011.10:c2161209-2159779 INS [organism=Homo sapiens] [GeneID=3630] [chromosome=11]"
attr(,"class")
[1] "SeqFastadna"

12.3 Samtools in R

BiocManager::install("Rsamtools")
library(Rsamtools)

The BamFile function is used to read an alignment in .bam format. The code below read a bam file available at the Sequence Read Archive (link). To get the summary of the bam file run quickBamFlagSummary with BamFile object as a argument.

bam_file <- BamFile("SRR11880777_chrUn_GL000213v1.bam")
quickBamFlagSummary(bam_file)
                                group |    nb of |    nb of | mean / max
                                   of |  records |   unique | records per
                              records | in group |   QNAMEs | unique QNAME
All records........................ A |      188 |      188 |    1 / 1
  o template has single segment.... S |      188 |      188 |    1 / 1
  o template has multiple segments. M |        0 |        0 |   NA / NA
      - first segment.............. F |        0 |        0 |   NA / NA
      - last segment............... L |        0 |        0 |   NA / NA
      - other segment.............. O |        0 |        0 |   NA / NA

Note that (S, M) is a partitioning of A, and (F, L, O) is a partitioning of M.
Indentation reflects this.

Details for group S:
  o record is mapped.............. S1 |      188 |      188 |    1 / 1
      - primary alignment......... S2 |        3 |        3 |    1 / 1
      - secondary alignment....... S3 |      185 |      185 |    1 / 1
  o record is unmapped............ S4 |        0 |        0 |   NA / NA

The scanBam function can be used to parse the bam files. By default, the entire bam file is processed. To limit the number of sequences to process. yieldSize function can be used.

yieldSize(bam_file) <- 1
open(bam_file)
scanBam(bam_file)
[[1]]
[[1]]$qname
[1] "SRR11880777.16337755"

[[1]]$flag
[1] 0

[[1]]$rname
[1] chrUn_GL000213v1
195 Levels: chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 ... chrEBV

[[1]]$strand
[1] +
Levels: + - *

[[1]]$pos
[1] 16493

[[1]]$qwidth
[1] 51

[[1]]$mapq
[1] 3

[[1]]$cigar
[1] "51M"

[[1]]$mrnm
[1] <NA>
195 Levels: chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 ... chrEBV

[[1]]$mpos
[1] NA

[[1]]$isize
[1] 0

[[1]]$seq
DNAStringSet object of length 1:
    width seq
[1]    51 TAGACTTTAAGTTCTAGGGTACATTTGCACAAAGTGCAGGTTTGTTAAATA

[[1]]$qual
PhredQuality object of length 1:
    width seq
[1]    51 ???????????????????????????????????????????????????

Once the parsing of bam file is complete, close the file and reset the yieldSize.

close(bam_file)
yieldSize(bam_file) <- NA

12.4 Working with structure data

The bio3d package has a set of functions for working with macromolecular structure data. This library can be used to analyze not just individual structure files (e.g. pdb files) but also to analyze trajectories such as dcd files. To install this library run install.package(bio3d) and to get a list function available in this library run help(package=bio3d). Below we’ll look at some of the basic functions in bio3d.

library(bio3d)

The read.pdb function is used to read a structure in pdb format. This function returns an object that has different attributes to access relevant structural information. E.g., the atom attribute returns a dataframe for all ATOM and HETATM fields in the pdb file.

pdb1 <- read.pdb("5ivq.pdb")
# Summary of the pdb file
print(pdb1)

 Call:  read.pdb(file = "5ivq.pdb")

   Total Models#: 1
     Total Atoms#: 1770,  XYZs#: 5310  Chains#: 2  (values: A B)

     Protein Atoms#: 1520  (residues/Calpha atoms#: 198)
     Nucleic acid Atoms#: 0  (residues/phosphate atoms#: 0)

     Non-protein/nucleic Atoms#: 250  (residues: 220)
     Non-protein/nucleic resid values: [ 6EH (1), CL (3), HOH (216) ]

   Protein sequence:
      PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYD
      QILIEICGHKAIGTVLVGPTPVNIIGRNLLTQIGCTLNFPQITLWQRPLVTIKIGGQLKE
      ALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPTP
      VNIIGRNLLTQIGCTLNF

+ attr: atom, xyz, seqres, helix, sheet,
        calpha, remark, call
tibble(pdb1$atom)
# A tibble: 1,770 × 16
   type  eleno elety alt   resid chain resno insert      x     y     z     o
   <chr> <int> <chr> <chr> <chr> <chr> <int> <chr>   <dbl> <dbl> <dbl> <dbl>
 1 ATOM      1 N     <NA>  PRO   A         1 <NA>   -0.303  2.84 -5.21     1
 2 ATOM      2 CA    <NA>  PRO   A         1 <NA>    0.594  3.92 -4.78     1
 3 ATOM      3 C     <NA>  PRO   A         1 <NA>    0.047  4.72 -3.61     1
 4 ATOM      4 O     <NA>  PRO   A         1 <NA>   -1.13   4.59 -3.27     1
 5 ATOM      5 CB    <NA>  PRO   A         1 <NA>    0.691  4.81 -6.02     1
 6 ATOM      6 CG    <NA>  PRO   A         1 <NA>   -0.543  4.54 -6.80     1
 7 ATOM      7 CD    <NA>  PRO   A         1 <NA>   -0.9    3.11 -6.54     1
 8 ATOM      8 N     <NA>  GLN   A         2 <NA>    0.909  5.54 -3.00     1
 9 ATOM      9 CA    <NA>  GLN   A         2 <NA>    0.502  6.44 -1.92     1
10 ATOM     10 C     <NA>  GLN   A         2 <NA>    0.608  7.83 -2.53     1
# ℹ 1,760 more rows
# ℹ 4 more variables: b <dbl>, segid <chr>, elesy <chr>, charge <chr>

The calpha attribute of the pdb object returns a Boolean value depending upon whether the atom name is CA or not. The calpha attribute of the pdb object returns a Boolean value depending upon whether the atom name is CA or not. This can used to subset the atom dataframe to exact e.g., the sequence of the protein based on the ATOM record. Note that this would give the sequence for all the chains in the pdb file.

seq_prot <- pdb1$atom$resid[pdb1$calpha]
cat(aa321(seq_prot), sep = "")
PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPTPVNIIGRNLLTQIGCTLNFPQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPTPVNIIGRNLLTQIGCTLNF

Similarly, we can subset the atom dataframe to extract B-factors. The B-factor plot can be annotated with secondary structure using the secondary structure elements (sse) class of the pdb object.

plot.bio3d(pdb1$atom$b[pdb1$calpha], sse=pdb1, typ="l", ylab="B-factor")

There bio3d library has some utility functions to analyze structures e.g. the rgyr function returns the radius of gyration for a given structure.

rgyr(pdb1)
[1] 17.64543

The binding.site function return the residues of the protein within a specified distance cutoff from the ligand. The protein and ligand moities within the pdb object are detected automatically. There is also an option to specify atom mask to identify interface residues as required.

bs <- binding.site(pdb1, cutoff = 3.5)
cat(bs$resnames, sep = ",")
ASP 25 (A),GLY 27 (A),ASP 29 (A),GLY 48 (A),GLY 49 (A),THR 74 (A),ASN 88 (A),TRP 6 (B),ARG 8 (B),ASP 25 (B),THR 74 (B),ASN 88 (B)

To find interface residues between two protein chains we first need to make atom selections corresponding to the two chains. Then use these atom masks as arguments for the binding.site function.

chainA.inds <- atom.select(pdb1, chain="A")
chainB.inds <- atom.select(pdb1, chain="B")
## Residues of Chain A around Chain B.
chainA_resi <- binding.site(pdb1, a.inds=chainA.inds, b.inds=chainB.inds, cutoff = 3)
cat(chainA_resi$resnames, "\n", sep = ",")
PRO 1 (A),GLN 2 (A),ILE 3 (A),LEU 5 (A),ARG 8 (A),LEU 24 (A),ASP 25 (A),THR 26 (A),ASP 29 (A),GLY 51 (A),PHE 53 (A),ARG 87 (A),ILE 93 (A),GLY 94 (A),THR 96 (A),LEU 97 (A),ASN 98 (A),PHE 99 (A),6EH 101 (A),HOH 207 (A),HOH 225 (A),HOH 232 (A),HOH 234 (A),HOH 238 (A),HOH 262 (A),HOH 271 (A),HOH 280 (A),HOH 286 (A),
## Residues of Chain B around Chain A.
chainB_resi <- binding.site(pdb1, a.inds=chainB.inds, b.inds=chainA.inds, cutoff = 3)
cat(chainB_resi$resnames, sep = ",")
PRO 1 (B),GLN 2 (B),ILE 3 (B),LEU 5 (B),ARG 8 (B),LEU 24 (B),ASP 25 (B),THR 26 (B),ASP 29 (B),ILE 50 (B),ARG 87 (B),ILE 93 (B),THR 96 (B),LEU 97 (B),ASN 98 (B),PHE 99 (B),HOH 207 (B),HOH 208 (B),HOH 216 (B),HOH 224 (B),HOH 232 (B),HOH 240 (B),HOH 242 (B),HOH 247 (B),HOH 252 (B),HOH 269 (B),HOH 285 (B),HOH 305 (B)

Contact map

inds <- atom.select(pdb1, "calpha")
ref.cont <- cmap( pdb1$xyz[inds$xyz], dcut=6, scut=3)
plot.cmap(ref.cont, sse=pdb1, asp=1)
plot.cmap(t(ref.cont), col=3, pch=17, add=TRUE, asp=1)