Introduction

1. Libraries and Workspace

suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(dendextend))
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(textshape))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(vegan))
suppressPackageStartupMessages(library(pracma))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(readxl))

session_info()

# ─ Session info ───────────────────────────────────────
#  setting  value
#  version  R version 4.3.2 (2023-10-31)
#  os       Manjaro Linux
#  system   x86_64, linux-gnu
#  ui       RStudio
#  language (EN)
#  collate  en_US.UTF-8
#  ctype    en_US.UTF-8
#  tz       America/Mexico_City
#  date     2024-02-21
#  rstudio  2023.09.1+494 Desert Sunflower (desktop)
#  pandoc   3.1.6 @ /usr/bin/ (via rmarkdown)
# 
# ─ Packages ───────────────────────────────────────────
#  package       * version date (UTC) lib source
#  ape             5.7-1   2023-03-13 [1] CRAN (R 4.3.2)
#  cellranger      1.1.0   2016-07-27 [1] CRAN (R 4.3.2)
#  circlize      * 0.4.16  2024-02-20 [1] CRAN (R 4.3.2)
#  cli             3.6.2   2023-12-11 [1] CRAN (R 4.3.2)
#  cluster         2.1.4   2022-08-22 [2] CRAN (R 4.3.2)
#  colorspace      2.1-0   2023-01-23 [1] CRAN (R 4.3.1)
#  data.table      1.15.0  2024-01-30 [1] CRAN (R 4.3.2)
#  dendextend    * 1.17.1  2023-03-25 [1] CRAN (R 4.3.2)
#  digest          0.6.34  2024-01-11 [1] CRAN (R 4.3.2)
#  dplyr         * 1.1.4   2023-11-17 [1] CRAN (R 4.3.2)
#  evaluate        0.23    2023-11-01 [1] CRAN (R 4.3.2)
#  factoextra      1.0.7   2020-04-01 [1] CRAN (R 4.3.2)
#  fansi           1.0.6   2023-12-08 [1] CRAN (R 4.3.2)
#  fastmap         1.1.1   2023-02-24 [1] CRAN (R 4.3.2)
#  generics        0.1.3   2022-07-05 [1] CRAN (R 4.3.2)
#  ggplot2       * 3.4.4   2023-10-12 [1] CRAN (R 4.3.2)
#  ggrepel         0.9.5   2024-01-10 [1] CRAN (R 4.3.2)
#  GlobalOptions   0.1.2   2020-06-10 [1] CRAN (R 4.3.2)
#  glue            1.7.0   2024-01-09 [1] CRAN (R 4.3.2)
#  gridExtra       2.3     2017-09-09 [1] CRAN (R 4.3.2)
#  gtable          0.3.4   2023-08-21 [1] CRAN (R 4.3.1)
#  htmltools       0.5.7   2023-11-03 [1] CRAN (R 4.3.2)
#  knitr           1.45    2023-10-30 [1] CRAN (R 4.3.2)
#  lattice       * 0.21-9  2023-10-01 [2] CRAN (R 4.3.2)
#  lifecycle       1.0.4   2023-11-07 [1] CRAN (R 4.3.2)
#  magrittr        2.0.3   2022-03-30 [1] CRAN (R 4.3.1)
#  MASS            7.3-60  2023-05-04 [2] CRAN (R 4.3.2)
#  Matrix          1.6-1.1 2023-09-18 [2] CRAN (R 4.3.2)
#  mgcv            1.9-0   2023-07-11 [2] CRAN (R 4.3.2)
#  munsell         0.5.0   2018-06-12 [1] CRAN (R 4.3.1)
#  nlme            3.1-163 2023-08-09 [2] CRAN (R 4.3.2)
#  permute       * 0.9-7   2022-01-27 [1] CRAN (R 4.3.2)
#  pheatmap      * 1.0.12  2019-01-04 [1] CRAN (R 4.3.2)
#  pillar          1.9.0   2023-03-22 [1] CRAN (R 4.3.1)
#  pkgconfig       2.0.3   2019-09-22 [1] CRAN (R 4.3.1)
#  pracma        * 2.4.4   2023-11-10 [1] CRAN (R 4.3.2)
#  purrr           1.0.2   2023-08-10 [1] CRAN (R 4.3.2)
#  R6              2.5.1   2021-08-19 [1] CRAN (R 4.3.1)
#  RColorBrewer    1.1-3   2022-04-03 [1] CRAN (R 4.3.1)
#  Rcpp            1.0.12  2024-01-09 [1] CRAN (R 4.3.2)
#  readxl        * 1.4.3   2023-07-06 [1] CRAN (R 4.3.2)
#  rlang           1.1.3   2024-01-10 [1] CRAN (R 4.3.2)
#  rmarkdown       2.25    2023-09-18 [1] CRAN (R 4.3.2)
#  rsconnect       1.2.0   2023-12-15 [1] CRAN (R 4.3.2)
#  rstudioapi      0.15.0  2023-07-07 [1] CRAN (R 4.3.2)
#  scales          1.3.0   2023-11-28 [1] CRAN (R 4.3.2)
#  sessioninfo   * 1.2.2   2021-12-06 [1] CRAN (R 4.3.2)
#  shape           1.4.6   2021-05-19 [1] CRAN (R 4.3.2)
#  stringi         1.8.3   2023-12-11 [1] CRAN (R 4.3.2)
#  stringr       * 1.5.1   2023-11-14 [1] CRAN (R 4.3.2)
#  textshape     * 1.7.3   2021-05-28 [1] CRAN (R 4.3.2)
#  tibble          3.2.1   2023-03-20 [1] CRAN (R 4.3.1)
#  tidyr         * 1.3.1   2024-01-24 [1] CRAN (R 4.3.2)
#  tidyselect      1.2.0   2022-10-10 [1] CRAN (R 4.3.2)
#  utf8            1.2.4   2023-10-22 [1] CRAN (R 4.3.1)
#  vctrs           0.6.5   2023-12-01 [1] CRAN (R 4.3.2)
#  vegan         * 2.6-4   2022-10-11 [1] CRAN (R 4.3.2)
#  viridis         0.6.5   2024-01-29 [1] CRAN (R 4.3.2)
#  viridisLite     0.4.2   2023-05-02 [1] CRAN (R 4.3.1)
#  withr           3.0.0   2024-01-16 [1] CRAN (R 4.3.2)
#  xfun            0.41    2023-11-01 [1] CRAN (R 4.3.2)
#  yaml            2.3.8   2023-12-11 [1] CRAN (R 4.3.2)
# 
#  [1] /home/chombo/R/x86_64-pc-linux-gnu-library/4.3
#  [2] /usr/lib/R/library

2. Data Preparation

BASH CLEANING

## First we cleaned the data from the 52514 MAGs from Nayfach project
## Here we extracted the taxonomic classification column and split it into different columns
## Then we merge the output with the complete Nayfach project metadata
paste <(cut -f1,15 AllMAGs-genome_metadata.tsv | tr ";" "\t" | awk '{ if (NR != 1) print }' | sed 's/[a-z]__//g' | awk -F "\t" '{ print $2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$8 }') <(awk -F "\t" '{if (NR != 1) print}' AllMAGs-genome_metadata.tsv | cut --complement -d$'\t' -f15) > MAGs_test.tsv

R CLEANING

## THIS IS IN THE '01_MetadataCleaning.R'
## Then we filtered the MAGs by the following parameters
## Completness >= 99
## Contamination <= 0.05

## Read the file
setwd(dir = '/home/chombo/Documents/SEGOVIA/CAZymes_PULs/data/')
colnames.vec <- c("division","phylum","class","order","family","specie","genome_id","metagenome_id","genome_length","num_contigs","n50","num_16s","num_5s","num_23s","num_trna","completeness","contamination","quality_score","mimag_quality","otu_id","ecosystem_category","ecosystem_type","habitat","longitude","latitude")
MAGS.df <- read.table('MAGs_test.tsv',sep = "\t",col.names = colnames.vec)

## Filter the file
## Completness >= 99%
## Contamination <= 0.05%
MAGS.filtered <- subset(MAGS.df,completeness >= 99 & contamination <= 0.05 & family != "")
MAGS.filtered <- MAGS.filtered %>%
    select(genome_id,metagenome_id,everything())
MAGS.filtered <- MAGS.filtered[order(MAGS.filtered$genome_id),]

## Rename specific cases
MAGS.filtered$class[MAGS.filtered$class == "Bacilli_A"] <- "Bacilli"
MAGS.filtered$class[MAGS.filtered$class == "Thermoplasmata_A"] <- "Thermoplasmata"
MAGS.filtered$order[MAGS.filtered$order == "Bacillales_A"] <- "Bacillales"

## Save the filtered file
## Output: 1069 MAGs
# write.table(MAGS.filtered,file = "1069_MAGs_metadata.tsv",sep = "\t",row.names = FALSE)

## Filter diversity by ecosystem type, class, order and family
## Ecosystem
eco.div <- as.data.frame(table(MAGS.filtered$ecosystem_category))
colnames(eco.div) <- c("ecosystem","frequency")

## Class
class.div <- as.data.frame(table(MAGS.filtered$class))
colnames(class.div) <- c("class","frequency")

## Order
order.div <- as.data.frame(table(MAGS.filtered$order))
colnames(order.div) <- c("order","frequency")

## Family
fam.div <- as.data.frame(table(MAGS.filtered$family))
colnames(fam.div) <- c("family","frequency")

# write.table(MAGS.filtered$genome_id,file = "1069MAGS_list.txt",sep = "",row.names = FALSE,quote = FALSE,col.names = FALSE)
# write.table(eco.div$ecosystem,file = "1069Eco_list.txt",sep = "",row.names = FALSE,quote = FALSE,col.names = FALSE)
# write.table(class.div$class,file = "1069Class_list.txt",sep = "",row.names = FALSE,quote = FALSE,col.names = FALSE)
# write.table(order.div$order,file = "1069Order_list.txt",sep = "",row.names = FALSE,quote = FALSE,col.names = FALSE)
# write.table(fam.div$family,file = "1069Fam_list.txt",sep = "",row.names = FALSE,quote = FALSE,col.names = FALSE)

print(paste0("Total Filtered MAGs: ",length(unique(c(MAGS.filtered$genome_id)))))

## [1] "Total Filtered MAGs: 1069"

3. Get PULs frequency and CAZymes frequency

Here we are going to wotk with the 1069 filtered MAGs. DBcan4 had already been run and DIAMOND had also already been run for obtaining the PULs. These scripts are in the ‘/src/BASH/’ folder inside the repo.

3.1 PULs counts

## PULs Counts
## Import MAGs list
setwd(dir = '/home/chombo/Documents/SEGOVIA/CAZymes_PULs/data/')
MAGS.arr <- as.vector(read.table("1069MAGS_list.txt"))[[1]]

## Create a df with all the PUL counts according to each MAG
PULS.df <- data.frame()
for (mag in MAGS.arr) {
    ## Filter the 5 MAGs that do not have significant PULs associated with them
    if (mag == "3300005326_35" || mag == "3300005370_1" || mag == "3300017650_9" || mag == "3300027284_65" || mag == "3300027607_50") {
        next
    }
    df.temp <- read.table(paste(mag,"/",mag,".matches_evalue.out",sep = ""))
    colnames(df.temp) <- c("PULs","evalue")
    df.temp["MAG"] <- mag
    
    ## Save the PULs frequency
    freq <- as.data.frame(table(df.temp$PULs))
    freq <- freq[order(freq$Freq,decreasing = TRUE),]
    write.table(freq,file = paste(mag,"/",mag,".matches_counts.out",sep = ""),sep = "\t",quote = FALSE,col.names = FALSE,row.names = FALSE)
    
    ## Store them in a single df
    PULS.df <- rbind(PULS.df,df.temp)
}

print(paste0(length(PULS.df$PULs)," total PULs"))

## [1] "36172 total PULs"

print(paste0(length(unique(PULS.df$PULs))," different PULs"))

## [1] "1363 different PULs"

print(paste0(length(unique(PULS.df$MAG))," final MAGs"))

## [1] "1064 final MAGs"

## 36172 total PULs
## 1363 different PULs
## 1064 final MAGs

## Get the summary
PULSperMAG.df <- as.data.frame(table(PULS.df$MAG))
colnames(PULSperMAG.df) <- c("MAGs","PULs_Freq")
excludedMAGS <- data.frame(MAGs = c("3300005326_35","3300005370_1","3300017650_9","3300027284_65","3300027607_50"),
                           PULs_Freq = c(0,0,0,0,0))
PULSperMAG.df <- rbind(PULSperMAG.df,excludedMAGS)

## Get the PULs diversity
puls.diversity <- as.data.frame(table(PULS.df$PULs))

3.1.1 PULs Ecosystem counts

## Read the metadata and filter it
setwd(dir = '/home/chombo/Documents/SEGOVIA/CAZymes_PULs/data/')
MAGS.meta <- read.table(file = "1069_MAGs_metadata.tsv",sep = "\t",header = TRUE)
MAGS.meta <- MAGS.meta %>%
    select(genome_id,class,order,family,ecosystem_category)
MAGS.meta$ecosystem_category <- gsub(" ","_",MAGS.meta$ecosystem_category)

## Remove the excluded MAGs
MAGS.meta <- MAGS.meta[!is.element(MAGS.meta$genome_id,c("3300005326_35","3300005370_1","3300017650_9","3300027284_65","3300027607_50")),]

## PULs per Ecosystem
eco.arr <- MAGS.meta %>%
    distinct(ecosystem_category)
eco.arr <- eco.arr[order(eco.arr$ecosystem_category,decreasing = FALSE),]

full.div.df <- data.frame()
for (eco in eco.arr) {
    ## Extract all the MAGs annotated in each Ecosystem
    mags.arr.temp <- as.vector(MAGS.meta$genome_id[MAGS.meta$ecosystem_category == eco])
    
    ## For each MAG, extract all the PUL counts
    PULSperMAG.temp <- PULSperMAG.df[is.element(PULSperMAG.df$MAGs,mags.arr.temp),]    
    
    ## Extract the complete diversity
    eco.div.df <- data.frame(ecosystem = c(eco),
                             frequency = sum(PULSperMAG.temp$PULs_Freq))
    full.div.df <- rbind(full.div.df,eco.div.df)
    
    ## Extract the PULs diversity
    rawPULs.temp <- PULS.df[is.element(PULS.df$MAG,mags.arr.temp),]
    pulscounts.temp <- as.data.frame(table(rawPULs.temp$PULs))
    colnames(pulscounts.temp) <- c("PULs","frequency")
    pulscounts.temp <- pulscounts.temp[order(pulscounts.temp$frequency,decreasing = TRUE),]
}

head(full.div.df,24L)

##            ecosystem frequency
## 1              Algae        17
## 2             Animal       599
## 3           Annelida        33
## 4            Aquatic      6186
## 5         Arthropoda       344
## 6         Bioreactor       856
## 7     Bioremediation       107
## 8  Biotransformation        38
## 9              Birds       155
## 10 Built_environment      5285
## 11          Cnidaria        25
## 12             Fungi        75
## 13             Human     16581
## 14           Insecta       390
## 15    Lab_enrichment       892
## 16     Lab_synthesis       119
## 17           Mammals       782
## 18         Microbial       109
## 19           Modeled        29
## 20            Plants       422
## 21       Solid_waste       247
## 22       Terrestrial      2134
## 23        Wastewater       747

print(paste0(sum(full.div.df$frequency)," total PULs"))

## [1] "36172 total PULs"

3.1.2 PULs Class counts

## Read the metadata and filter it
setwd(dir = '/home/chombo/Documents/SEGOVIA/CAZymes_PULs/data/')

## PULs per Class
class.arr <- MAGS.meta %>%
    distinct(class)
class.arr <- class.arr[order(class.arr$class,decreasing = FALSE),]

full.div.df <- data.frame()
for (class in class.arr) {
    ## Extract all the MAGs annotated in each Class
    mags.arr.temp <- as.vector(MAGS.meta$genome_id[MAGS.meta$class == class])
    
    ## For each MAG, extract all the PUL counts
    PULSperMAG.temp <- PULSperMAG.df[is.element(PULSperMAG.df$MAGs,mags.arr.temp),]    
    
    ## Extract the complete diversity
    class.div.df <- data.frame(class = c(class),
                               frequency = sum(PULSperMAG.temp$PULs_Freq))
    full.div.df <- rbind(full.div.df,class.div.df)
    
    ## Extract the PULs diversity
    rawPULs.temp <- PULS.df[is.element(PULS.df$MAG,mags.arr.temp),]
    pulscounts.temp <- as.data.frame(table(rawPULs.temp$PULs))
    colnames(pulscounts.temp) <- c("PULs","frequency")
    pulscounts.temp <- pulscounts.temp[order(pulscounts.temp$frequency,decreasing = TRUE),]
}

head(full.div.df,39L)

##                     class frequency
## 1                     AC1        74
## 2          Acidobacteriae       279
## 3          Actinobacteria      1867
## 4     Alphaproteobacteria      1477
## 5            Archaeoglobi       165
## 6                 Bacilli      4020
## 7             Bacteroidia      7222
## 8         Bdellovibrionia        38
## 9         Campylobacteria       102
## 10           Chloroflexia        21
## 11          Cloacimonadia        48
## 12             Clostridia      3347
## 13    Coprothermobacteria        96
## 14         Coriobacteriia       651
## 15         Cyanobacteriia       111
## 16        Dehalococcoidia         5
## 17             Deinococci      2498
## 18     Desulfitobacteriia        79
## 19        Desulfobacteria       258
## 20       Desulfovibrionia       421
## 21           Dictyoglomia        69
## 22          Fusobacteriia       164
## 23    Gammaproteobacteria      6597
## 24           Halobacteria        70
## 25        Methanobacteria       284
## 26        Methanomicrobia        93
## 27        Methanosarcinia       311
## 28          Negativicutes      3138
## 29        Nitrososphaeria       155
## 30           Rhodothermia       116
## 31           Spirochaetia       648
## 32            Synergistia       438
## 33 Thermodesulfovibrionia       366
## 34         Thermoplasmata       103
## 35           Thermoprotei       211
## 36            Thermotogae       180
## 37                UBA5184        51
## 38       Verrucomicrobiae       399

print(paste0(sum(full.div.df$frequency)," total PULs"))

## [1] "36172 total PULs"

3.1.3 PULs Order counts

## Read the metadata and filter it
setwd(dir = '/home/chombo/Documents/SEGOVIA/CAZymes_PULs/data/')

## PULs per Order
order.arr <- MAGS.meta %>%
    distinct(order)
order.arr <- order.arr[order(order.arr$order,decreasing = FALSE),]

full.div.df <- data.frame()
for (ord in order.arr) {
    ## Extract all the MAGs annotated in each Order
    mags.arr.temp <- as.vector(MAGS.meta$genome_id[MAGS.meta$order == ord])
    
    ## For each MAG, extract all the PUL counts
    PULSperMAG.temp <- PULSperMAG.df[is.element(PULSperMAG.df$MAGs,mags.arr.temp),]    

    ## Extract the complete diversity
    order.div.df <- data.frame(order = c(ord),
                               frequency = sum(PULSperMAG.temp$PULs_Freq))
    full.div.df <- rbind(full.div.df,order.div.df)
    
    ## Extract the PULs diversity
    rawPULs.temp <- PULS.df[is.element(PULS.df$MAG,mags.arr.temp),]
    pulscounts.temp <- as.data.frame(table(rawPULs.temp$PULs))
    colnames(pulscounts.temp) <- c("PULs","frequency")
    pulscounts.temp <- pulscounts.temp[order(pulscounts.temp$frequency,decreasing = TRUE),]
}

head(full.div.df,81L)

##                       order frequency
## 1                       AC1        74
## 2          Acetivibrionales        68
## 3           Acetobacterales        48
## 4         Acidaminococcales      1343
## 5          Acidiprofundales        38
## 6          Acidobacteriales       279
## 7           Actinomycetales      1110
## 8           Archaeoglobales       165
## 9            Azospirillales        74
## 10               Bacillales        81
## 11            Bacteroidales      2771
## 12              Balneolales        59
## 13        Bdellovibrionales        38
## 14    Betaproteobacteriales      2553
## 15          Brevibacillales        50
## 16          Caldiarchaeales        32
## 17        Campylobacterales       102
## 18          Caulobacterales       439
## 19          Chitinophagales       571
## 20       Christensenellales        59
## 21          Cloacimonadales        48
## 22    Coprothermobacterales        96
## 23         Coriobacteriales       651
## 24        Corynebacteriales       379
## 25         Cyanobacteriales       111
## 26             Cytophagales       231
## 27        Dehalococcoidales         5
## 28            Deinococcales      2498
## 29     Desulfitobacteriales        79
## 30        Desulfobacterales       258
## 31       Desulfovibrionales       421
## 32           Dictyoglomales        69
## 33         Enterobacterales      1536
## 34       Erysipelotrichales      1887
## 35         Exiguobacterales       210
## 36         Flavobacteriales      3195
## 37          Fusobacteriales       164
## 38          Halobacteriales        70
## 39          Haloplasmatales        35
## 40           Lachnospirales      2316
## 41          Lactobacillales      1501
## 42       Methanobacteriales       284
## 43  Methanomassiliicoccales         9
## 44       Methanomicrobiales        93
## 45        Methanosarcinales       204
## 46         Methanotrichales       107
## 47                ML615J-28        17
## 48          Mycoplasmatales        35
## 49        Nitrososphaerales       123
## 50                 NS11-12g       103
## 51          Oscillospirales       708
## 52          Paenibacillales        33
## 53     Peptostreptococcales       146
## 54      Propionibacteriales       378
## 55          Pseudomonadales      2023
## 56                    RFN20         2
## 57              Rhizobiales       477
## 58          Rhodobacterales       154
## 59         Rhodospirillales        92
## 60           Rhodothermales        57
## 61          Selenomonadales       156
## 62           Sneathiellales        37
## 63         Sphaerochaetales         7
## 64       Sphingobacteriales       351
## 65         Sphingomonadales        69
## 66           Spirochaetales        33
## 67         Staphylococcales       169
## 68             Sulfolobales       211
## 69            Synergistales       438
## 70 Thermodesulfovibrionales       366
## 71        Thermomicrobiales        21
## 72        Thermoplasmatales        56
## 73            Thermotogales       180
## 74           Tissierellales        50
## 75           Treponematales       608
## 76                  UBA5184        51
## 77                   UBA998        87
## 78           Veillonellales      1639
## 79       Verrucomicrobiales       399
## 80          Xanthomonadales       485

print(paste0(sum(full.div.df$frequency)," total PULs"))

## [1] "36172 total PULs"

3.1.4 PULs Family counts

## Read the metadata and filter it
setwd(dir = '/home/chombo/Documents/SEGOVIA/CAZymes_PULs/data/')

## PULs per Family
fam.arr <- MAGS.meta %>%
    distinct(family)
fam.arr <- fam.arr[order(fam.arr$family,decreasing = FALSE),]

full.div.df <- data.frame()
for (fam in fam.arr) {
    ## Extract all the MAGs annotated in each Family
    mags.arr.temp <- as.vector(MAGS.meta$genome_id[MAGS.meta$family == fam])
    
    ## For each MAG, extract all the PUL counts
    PULSperMAG.temp <- PULSperMAG.df[is.element(PULSperMAG.df$MAGs,mags.arr.temp),]    
    
    ## Extract the complete diversity
    fam.div.df <- data.frame(family = c(fam),
                               frequency = sum(PULSperMAG.temp$PULs_Freq))
    full.div.df <- rbind(full.div.df,fam.div.df)
    
    ## Extract the PULs diversity
    rawPULs.temp <- PULS.df[is.element(PULS.df$MAG,mags.arr.temp),]
    pulscounts.temp <- as.data.frame(table(rawPULs.temp$PULs))
    colnames(pulscounts.temp) <- c("PULs","frequency")
    pulscounts.temp <- pulscounts.temp[order(pulscounts.temp$frequency,decreasing = TRUE),]
}

head(full.div.df,139L)

##                        family frequency
## 1                        1G12        25
## 2           Acetivibrionaceae        68
## 3            Acetobacteraceae        48
## 4          Acidaminococcaceae      1343
## 5           Acidobacteriaceae        26
## 6               Aerococcaceae        46
## 7             Akkermansiaceae       156
## 8            Alcanivoracaceae       158
## 9           Aminobacteriaceae        41
## 10           Anaerovoracaceae        73
## 11           Archaeoglobaceae       165
## 12               Atopobiaceae       123
## 13             Bacteroidaceae       301
## 14               Balneolaceae        59
## 15            Barnesiellaceae       884
## 16         Bdellovibrionaceae        38
## 17           Beijerinckiaceae       156
## 18         Bifidobacteriaceae       429
## 19           Brevibacillaceae        50
## 20           Burkholderiaceae      2424
## 21                    CAG-138        59
## 22                    CAG-698        17
## 23                    CAG-826         2
## 24           Caldibacillaceae        64
## 25         Campylobacteraceae        72
## 26           Caulobacteraceae       439
## 27          Cellulomonadaceae       367
## 28           Chitinophagaceae       571
## 29           Cloacimonadaceae        48
## 30           Coprobacteraceae         8
## 31     Coprothermobacteraceae        96
## 32          Coriobacteriaceae       297
## 33         Corynebacteriaceae       379
## 34          Crocinitomicaceae        47
## 35             Cryomorphaceae       246
## 36          Cyanobacteriaceae        51
## 37          Cyclobacteriaceae       166
## 38         Dehalococcoidaceae         5
## 39           Dermatophilaceae       102
## 40         Desulfobacteraceae       258
## 41        Desulfomicrobiaceae        22
## 42        Desulfovibrionaceae       399
## 43    Dethiosulfovibrionaceae        40
## 44                Devosiaceae       169
## 45            Dictyoglomaceae        69
## 46                     DTU089       125
## 47          Dysgonomonadaceae        59
## 48            Eggerthellaceae       231
## 49         Enterobacteriaceae      1402
## 50            Enterococcaceae       719
## 51  Erysipelatoclostridiaceae      1125
## 52        Erysipelotrichaceae       762
## 53                      EtOH8        74
## 54          Exiguobacteraceae       210
## 55          Ferrovibrionaceae        37
## 56        Fervidobacteriaceae       126
## 57          Flavobacteriaceae       442
## 58           Fusobacteriaceae       164
## 59                Hahellaceae        79
## 60             Haloferacaceae        70
## 61          Helicobacteraceae        30
## 62              Inquilinaceae        74
## 63                    JdFR-45        38
## 64            JGI-0000106-J15        32
## 65                    koll-22        56
## 66            Koribacteraceae       253
## 67            Lachnospiraceae      2316
## 68           Lactobacillaceae       619
## 69        Magnetospirillaceae        51
## 70             Marinifilaceae       283
## 71           Marinomonadaceae        60
## 72            Megasphaeraceae       969
## 73        Methanobacteriaceae       280
## 74   Methanomassiliicoccaceae         3
## 75    Methanomethylophilaceae         6
## 76        Methanomicrobiaceae        29
## 77          Methanoregulaceae        64
## 78         Methanosarcinaceae       204
## 79   Methanothermobacteraceae         4
## 80          Methanotrichaceae       107
## 81           Methylophilaceae        21
## 82          Microbacteriaceae       139
## 83             Micrococcaceae        73
## 84              Moraxellaceae      1600
## 85           Mycoplasmataceae        30
## 86         Mycoplasmataceae_B         5
## 87              Neisseriaceae        95
## 88          Nitrosopumilaceae        84
## 89           Oscillospiraceae        81
## 90           Paenibacillaceae        33
## 91          Paludibacteraceae       937
## 92            Pasteurellaceae       134
## 93           Peptoniphilaceae        50
## 94      Peptostreptococcaceae        73
## 95              Phormidiaceae        60
## 96             Planococcaceae        17
## 97         Porphyromonadaceae        43
## 98             Porticoccaceae         5
## 99       Propionibacteriaceae       378
## 100          Pseudomonadaceae       121
## 101              Rhizobiaceae       152
## 102          Rhodobacteraceae       154
## 103            Rhodocyclaceae        13
## 104           Rhodothermaceae        57
## 105             Rikenellaceae       231
## 106           Ruminococcaceae       502
## 107          Selenomonadaceae       156
## 108         Sphaerochaetaceae         7
## 109       Sphingobacteriaceae       351
## 110         Sphingomonadaceae        69
## 111         Spirochaetaceae_B        33
## 112             Spirosomaceae        65
## 113         Staphylococcaceae       169
## 114          Streptococcaceae       117
## 115             Sulfolobaceae       211
## 116            Synergistaceae       130
## 117       Syntrophobotulaceae        79
## 118            Tannerellaceae        25
## 119         Thalassospiraceae        41
## 120                Thermaceae      2498
## 121 Thermodesulfovibrionaceae       366
## 122        Thermomicrobiaceae        21
## 123        Thermoplasmataceae        56
## 124            Thermotogaceae        54
## 125           Thermovirgaceae       227
## 126           Treponemataceae       608
## 127         Turicibacteraceae        35
## 128                      UA16       119
## 129                    UBA213        39
## 130                   UBA3002        87
## 131                   UBA5184        51
## 132                   UBA7430        58
## 133                    UBA955        55
## 134                   UKL13-3        48
## 135           Veillonellaceae       670
## 136       Verrucomicrobiaceae       243
## 137             Weeksellaceae      2202
## 138          Xanthomonadaceae       485

print(paste0(sum(full.div.df$frequency)," total PULs"))

## [1] "36172 total PULs"

3.2 CAZymes counts

## CAZYMES COUNTS
## Filter the DBcan4 output
## e-value <= 1e-18
## coverage >= 0.8

setwd(dir = '/home/chombo/Documents/SEGOVIA/CAZymes_PULs/data/')

## Create a df with all the CAZy counts according to each MAG
CAZYS.df <- data.frame()
for (mag in MAGS.arr) {
    ## Filter the 5 MAGs that do not have significant CAZys associated with them
    if (mag == "3300007500_43") {
        next
    }
    
    ## Read the HMMER output for those matches with an evalue <= 1e-18 and a coverage >= 0.8
    df.temp <- read.table(file = paste(mag,"/output_dbcan4/hmmer.out",sep = ""),sep = "\t",skip = 1)
    colnames(df.temp) <- c("CAZy","length","geneID","genelength","evalue","start","end","genestart","geneend","coverage")
    df.temp <- df.temp %>%
        filter(evalue <= 1e-18,coverage >= 0.8) %>%
        select(CAZy,geneID,evalue,coverage)
    
    ## Extract the gene ID
    ids <- as.vector(df.temp$geneID)
    
    ## Read the overview.txt file to compare it with the hmmer.txt file
    df.temp <- read.table(paste(mag,"/output_dbcan4/overview.txt",sep = ""),header = FALSE,skip = 1)
    colnames(df.temp) <- c("geneID","EC","HMMER","dbCAN_sub","DIAMOND","Signalp","Tools")
    df.temp["MAG"] <- mag
    df.temp <- df.temp[is.element(df.temp$geneID,ids),]
    df.temp <- df.temp %>%
        filter(Tools >= 2) %>%
        select(MAG,HMMER,Signalp)
    df.temp$HMMER <- gsub("\\(.*?\\)", "", df.temp$HMMER)
    df.temp$Signalp <- gsub("\\(.*\\)", "",df.temp$Signalp)
    df.temp <- df.temp[df.temp$HMMER != "-",]
    
    ## Save the CAZYs frequency
    freq <- as.data.frame(table(df.temp$HMMER))
    freq <- freq[order(freq$Freq,decreasing = TRUE),]
    # write.table(freq,file = paste(mag,"/output_dbcan4/",mag,".CAZyCounts.txt",sep = ""),sep = "\t",quote = FALSE,col.names = FALSE,row.names = FALSE)
    
    ## Clean the CAZys and do another df
    df.temp$HMMER <- gsub("\\+.*", "", df.temp$HMMER)
    
    ## Save the CAZYs frequency
    freq <- as.data.frame(table(df.temp$HMMER))
    # freq <- freq[order(freq$Freq,decreasing = TRUE),]
    # write.table(freq,file = paste(mag,"/output_dbcan4/",mag,".CAZyCounts.single.txt",sep = ""),sep = "\t",quote = FALSE,col.names = FALSE,row.names = FALSE)
    
    if (length(df.temp) == 0) {
        print(mag)
    }
    
    ## Store them in a single df
    CAZYS.df <- rbind(CAZYS.df,df.temp)
}
colnames(CAZYS.df) <- c("MAG","CAZys","Signalp")

print(paste0(length(CAZYS.df$CAZys)," total CAZys"))

## [1] "61654 total CAZys"

print(paste0(length(unique(CAZYS.df$CAZys))," different CAZys"))

## [1] "424 different CAZys"

print(paste0(length(unique(CAZYS.df$MAG))," final MAGs"))

## [1] "1064 final MAGs"

## 61654 total CAZys
## 424 different CAZys
## 1064 final MAGs

## Get the summary
CAZysperMAG.df <- as.data.frame(table(CAZYS.df$MAG))
colnames(CAZysperMAG.df) <- c("MAGs","CAZys_Freq")
excludedMAGS <- data.frame(MAGs = c("3300007500_43","3300011404_8","3300012067_9","3300012151_17","3300012165_16"),
                           CAZys_Freq = c(0,0,0,0,0))
CAZysperMAG.df <- rbind(CAZysperMAG.df,excludedMAGS)

## Search for MAGs without CAZys
# setdiff(MAGS.arr,c(CAZysperMAG.df$MAGs))
# [1] "3300011404_8"  "3300012067_9"  "3300012151_17"
# [4] "3300012165_16"

3.2.1 CAZy Ecosystem counts

## Read the metadata and filter it
setwd(dir = '/home/chombo/Documents/SEGOVIA/CAZymes_PULs/data/')

## Read the MAGs metadata
MAGS.meta <- read.table(file = "1069_MAGs_metadata.tsv",sep = "\t",header = TRUE)
MAGS.meta <- MAGS.meta %>%
    select(genome_id,class,order,family,ecosystem_category)
MAGS.meta$ecosystem_category <- gsub(" ","_",MAGS.meta$ecosystem_category)

## Remove the excluded MAGs
MAGS.meta <- MAGS.meta[!is.element(MAGS.meta$genome_id,c("3300007500_43","3300011404_8","3300012067_9","3300012151_17","3300012165_16")),]

## CAZys per Ecosystem
eco.arr <- MAGS.meta %>%
    distinct(ecosystem_category)
eco.arr <- eco.arr[order(eco.arr$ecosystem_category,decreasing = FALSE),]

full.div.df <- data.frame()
for (eco in eco.arr) {
    ## Extract all the MAGs annotated in each Class
    mags.arr.temp <- as.vector(MAGS.meta$genome_id[MAGS.meta$ecosystem_category == eco])
    
    ## For each MAG, extract all the PUL counts
    CAZYsperMAG.temp <- CAZysperMAG.df[is.element(CAZysperMAG.df$MAGs,mags.arr.temp),]    
    
    ## Extract the complete diversity
    eco.div.df <- data.frame(order = c(eco),
                                frequency = sum(CAZYsperMAG.temp$CAZys_Freq))
    full.div.df <- rbind(full.div.df,eco.div.df)
    
    ## Extract the PULs diversity
    rawCAZys.temp <- CAZYS.df[is.element(CAZYS.df$MAG,mags.arr.temp),]
    cazyscounts.temp <- as.data.frame(table(rawCAZys.temp$CAZys))
    colnames(cazyscounts.temp) <- c("CAZys","frequency")
    cazyscounts.temp <- cazyscounts.temp[order(cazyscounts.temp$frequency,decreasing = TRUE),]
}

head(full.div.df,139L)

##                order frequency
## 1              Algae        50
## 2             Animal       913
## 3           Annelida        23
## 4            Aquatic      5705
## 5         Arthropoda      1948
## 6         Bioreactor       773
## 7     Bioremediation       132
## 8  Biotransformation        43
## 9              Birds       676
## 10 Built_environment      9614
## 11          Cnidaria        67
## 12             Fungi       144
## 13             Human     28160
## 14           Insecta      1100
## 15    Lab_enrichment      2029
## 16     Lab_synthesis        57
## 17           Mammals      1757
## 18         Microbial       130
## 19           Modeled        58
## 20            Plants       760
## 21       Solid_waste       227
## 22       Terrestrial      5814
## 23        Wastewater      1474

print(paste0(sum(full.div.df$frequency)," total CAZymes"))

## [1] "61654 total CAZymes"

3.2.2 CAZy Class counts

## Read the metadata and filter it
setwd(dir = '/home/chombo/Documents/SEGOVIA/CAZymes_PULs/data/')

## CAZys per Class
class.arr <- MAGS.meta %>%
    distinct(class)
class.arr <- class.arr[order(class.arr$class,decreasing = FALSE),]

full.div.df <- data.frame()
for (class in class.arr) {
    ## Extract all the MAGs annotated in each Class
    mags.arr.temp <- as.vector(MAGS.meta$genome_id[MAGS.meta$class == class])
    
    ## For each MAG, extract all the PUL counts
    CAZYsperMAG.temp <- CAZysperMAG.df[is.element(CAZysperMAG.df$MAGs,mags.arr.temp),]    
    
    ## Extract the complete diversity
    class.div.df <- data.frame(class = c(class),
                               frequency = sum(CAZYsperMAG.temp$CAZys_Freq))
    full.div.df <- rbind(full.div.df,class.div.df)
    
    ## Extract the PULs diversity
    rawCAZys.temp <- CAZYS.df[is.element(CAZYS.df$MAG,mags.arr.temp),]
    cazyscounts.temp <- as.data.frame(table(rawCAZys.temp$CAZys))
    colnames(cazyscounts.temp) <- c("CAZys","frequency")
    cazyscounts.temp <- cazyscounts.temp[order(cazyscounts.temp$frequency,decreasing = TRUE),]
}

head(full.div.df,139L)

##                     class frequency
## 1                     AC1       186
## 2          Acidobacteriae       198
## 3          Actinobacteria      6152
## 4     Alphaproteobacteria      2219
## 5            Archaeoglobi        26
## 6                 Bacilli      6473
## 7             Bacteroidia     23498
## 8         Bdellovibrionia        43
## 9         Campylobacteria       107
## 10           Chloroflexia        76
## 11          Cloacimonadia        24
## 12             Clostridia      5845
## 13    Coprothermobacteria        52
## 14         Coriobacteriia       583
## 15         Cyanobacteriia       127
## 16        Dehalococcoidia        18
## 17             Deinococci      1713
## 18     Desulfitobacteriia        21
## 19        Desulfobacteria       180
## 20       Desulfovibrionia       378
## 21           Dictyoglomia       157
## 22          Fusobacteriia        97
## 23    Gammaproteobacteria      8501
## 24           Halobacteria        26
## 25        Methanobacteria       179
## 26        Methanomicrobia        28
## 27        Methanosarcinia       131
## 28          Negativicutes      2442
## 29        Nitrososphaeria        30
## 30           Rhodothermia       620
## 31           Spirochaetia       467
## 32            Synergistia       270
## 33 Thermodesulfovibrionia       287
## 34         Thermoplasmata        69
## 35           Thermoprotei        93
## 36            Thermotogae        97
## 37                UBA5184        52
## 38       Verrucomicrobiae       189

print(paste0(sum(full.div.df$frequency)," total CAZymes"))

## [1] "61654 total CAZymes"

3.2.3 CAZy Order counts

## Read the metadata and filter it
setwd(dir = '/home/chombo/Documents/SEGOVIA/CAZymes_PULs/data/')

## CAZys per Order
order.arr <- MAGS.meta %>%
    distinct(order)
order.arr <- order.arr[order(order.arr$order,decreasing = FALSE),]

full.div.df <- data.frame()
for (ord in order.arr) {
    ## Corresponds to the MAG without counts
    if (ord == "Mycoplasmatales") {
        next
    }
  
    ## Extract all the MAGs annotated in each Class
    mags.arr.temp <- as.vector(MAGS.meta$genome_id[MAGS.meta$order == ord])
    
    ## For each MAG, extract all the CAZy counts
    CAZYsperMAG.temp <- CAZysperMAG.df[is.element(CAZysperMAG.df$MAGs,mags.arr.temp),]    
    
    ## Extract the complete diversity
    order.div.df <- data.frame(order = c(ord),
                               frequency = sum(CAZYsperMAG.temp$CAZys_Freq))
    full.div.df <- rbind(full.div.df,order.div.df)
    
    ## Extract the CAZys diversity
    rawCAZys.temp <- CAZYS.df[is.element(CAZYS.df$MAG,mags.arr.temp),]
    cazyscounts.temp <- as.data.frame(table(rawCAZys.temp$CAZys))
    colnames(cazyscounts.temp) <- c("CAZys","frequency")
    cazyscounts.temp <- cazyscounts.temp[order(cazyscounts.temp$frequency,decreasing = TRUE),]
}

head(full.div.df,139L)

##                       order frequency
## 1                       AC1       186
## 2          Acetivibrionales       102
## 3           Acetobacterales        97
## 4         Acidaminococcales       705
## 5          Acidiprofundales         7
## 6          Acidobacteriales       198
## 7           Actinomycetales      4931
## 8           Archaeoglobales        26
## 9            Azospirillales        58
## 10               Bacillales        98
## 11            Bacteroidales     12264
## 12              Balneolales       109
## 13        Bdellovibrionales        43
## 14    Betaproteobacteriales      2189
## 15          Brevibacillales        56
## 16          Caldiarchaeales         5
## 17        Campylobacterales       107
## 18          Caulobacterales       847
## 19          Chitinophagales      2875
## 20       Christensenellales        43
## 21          Cloacimonadales        24
## 22    Coprothermobacterales        52
## 23         Coriobacteriales       583
## 24        Corynebacteriales       557
## 25         Cyanobacteriales       127
## 26             Cytophagales       816
## 27        Dehalococcoidales        18
## 28            Deinococcales      1713
## 29     Desulfitobacteriales        21
## 30        Desulfobacterales       180
## 31       Desulfovibrionales       378
## 32           Dictyoglomales       157
## 33         Enterobacterales      2910
## 34       Erysipelotrichales      2358
## 35         Exiguobacterales       542
## 36         Flavobacteriales      6209
## 37          Fusobacteriales        97
## 38          Halobacteriales        26
## 39          Haloplasmatales        88
## 40           Lachnospirales      4190
## 41          Lactobacillales      2848
## 42       Methanobacteriales       179
## 43  Methanomassiliicoccales        45
## 44       Methanomicrobiales        28
## 45        Methanosarcinales        96
## 46         Methanotrichales        35
## 47                ML615J-28        23
## 48        Nitrososphaerales        25
## 49                 NS11-12g        99
## 50          Oscillospirales      1397
## 51          Paenibacillales       214
## 52     Peptostreptococcales       103
## 53      Propionibacteriales       664
## 54          Pseudomonadales      2151
## 55                    RFN20        46
## 56              Rhizobiales       717
## 57          Rhodobacterales       116
## 58         Rhodospirillales       167
## 59           Rhodothermales       511
## 60          Selenomonadales       284
## 61           Sneathiellales        33
## 62         Sphaerochaetales        60
## 63       Sphingobacteriales      1235
## 64         Sphingomonadales       157
## 65           Spirochaetales        23
## 66         Staphylococcales       200
## 67             Sulfolobales        93
## 68            Synergistales       270
## 69 Thermodesulfovibrionales       287
## 70        Thermomicrobiales        76
## 71        Thermoplasmatales        17
## 72            Thermotogales        97
## 73           Tissierellales        10
## 74           Treponematales       384
## 75                  UBA5184        52
## 76                   UBA998        27
## 77           Veillonellales      1453
## 78       Verrucomicrobiales       189
## 79          Xanthomonadales      1251

print(paste0(sum(full.div.df$frequency)," total CAZymes"))

## [1] "61654 total CAZymes"

3.2.4 CAZy Family counts

## Read the metadata and filter it
setwd(dir = '/home/chombo/Documents/SEGOVIA/CAZymes_PULs/data/')

## CAZys per Family
fam.arr <- MAGS.meta %>%
    distinct(family)
fam.arr <- fam.arr[order(fam.arr$family,decreasing = FALSE),]

full.div.df <- data.frame()
for (fam in fam.arr) {
    ## Extract all the MAGs annotated in each Class
    mags.arr.temp <- as.vector(MAGS.meta$genome_id[MAGS.meta$family == fam])
    
    ## For each MAG, extract all the PUL counts
    CAZYsperMAG.temp <- CAZysperMAG.df[is.element(CAZysperMAG.df$MAGs,mags.arr.temp),]    
    
    ## Extract the complete diversity
    family.div.df <- data.frame(order = c(fam),
                               frequency = sum(CAZYsperMAG.temp$CAZys_Freq))
    full.div.df <- rbind(full.div.df,family.div.df)
    
    ## Extract the PULs diversity
    rawCAZys.temp <- CAZYS.df[is.element(CAZYS.df$MAG,mags.arr.temp),]
    cazyscounts.temp <- as.data.frame(table(rawCAZys.temp$CAZys))
    colnames(cazyscounts.temp) <- c("CAZys","frequency")
    cazyscounts.temp <- cazyscounts.temp[order(cazyscounts.temp$frequency,decreasing = TRUE),]
}

head(full.div.df,139L)

##                         order frequency
## 1                        1G12        67
## 2           Acetivibrionaceae       102
## 3            Acetobacteraceae        97
## 4          Acidaminococcaceae       705
## 5           Acidobacteriaceae       112
## 6               Aerococcaceae        33
## 7             Akkermansiaceae        83
## 8            Alcanivoracaceae       178
## 9           Aminobacteriaceae        40
## 10           Anaerovoracaceae        56
## 11           Archaeoglobaceae        26
## 12               Atopobiaceae       141
## 13             Bacteroidaceae      2208
## 14               Balneolaceae       109
## 15            Barnesiellaceae      6664
## 16         Bdellovibrionaceae        43
## 17           Beijerinckiaceae       246
## 18         Bifidobacteriaceae      1897
## 19           Brevibacillaceae        56
## 20           Burkholderiaceae      1887
## 21                    CAG-138        43
## 22                    CAG-698        23
## 23                    CAG-826        46
## 24           Caldibacillaceae        48
## 25         Campylobacteraceae        85
## 26           Caulobacteraceae       847
## 27          Cellulomonadaceae      2565
## 28           Chitinophagaceae      2875
## 29           Cloacimonadaceae        24
## 30           Coprobacteraceae       156
## 31     Coprothermobacteraceae        52
## 32          Coriobacteriaceae       275
## 33         Corynebacteriaceae       557
## 34          Crocinitomicaceae        94
## 35             Cryomorphaceae       269
## 36          Cyanobacteriaceae        51
## 37          Cyclobacteriaceae       622
## 38         Dehalococcoidaceae        18
## 39           Dermatophilaceae       163
## 40         Desulfobacteraceae       180
## 41        Desulfomicrobiaceae        37
## 42        Desulfovibrionaceae       341
## 43    Dethiosulfovibrionaceae        21
## 44                Devosiaceae       294
## 45            Dictyoglomaceae       157
## 46                     DTU089       288
## 47          Dysgonomonadaceae       506
## 48            Eggerthellaceae       167
## 49         Enterobacteriaceae      2726
## 50            Enterococcaceae      1550
## 51  Erysipelatoclostridiaceae      1546
## 52        Erysipelotrichaceae       812
## 53                      EtOH8       186
## 54          Exiguobacteraceae       542
## 55          Ferrovibrionaceae        33
## 56        Fervidobacteriaceae        60
## 57          Flavobacteriaceae      2142
## 58           Fusobacteriaceae        97
## 59                Hahellaceae       157
## 60             Haloferacaceae        26
## 61          Helicobacteraceae        22
## 62              Inquilinaceae        58
## 63                    JdFR-45         7
## 64            JGI-0000106-J15         5
## 65                    koll-22       149
## 66            Koribacteraceae        86
## 67            Lachnospiraceae      4190
## 68           Lactobacillaceae       966
## 69        Magnetospirillaceae        45
## 70             Marinifilaceae       193
## 71           Marinomonadaceae       100
## 72            Megasphaeraceae       817
## 73        Methanobacteriaceae       161
## 74   Methanomassiliicoccaceae        35
## 75    Methanomethylophilaceae        10
## 76        Methanomicrobiaceae        20
## 77          Methanoregulaceae         8
## 78         Methanosarcinaceae        96
## 79   Methanothermobacteraceae        18
## 80          Methanotrichaceae        35
## 81           Methylophilaceae       126
## 82          Microbacteriaceae       207
## 83             Micrococcaceae        99
## 84              Moraxellaceae      1500
## 85              Neisseriaceae       144
## 86          Nitrosopumilaceae        17
## 87           Oscillospiraceae        75
## 88           Paenibacillaceae       214
## 89          Paludibacteraceae      1657
## 90            Pasteurellaceae       184
## 91           Peptoniphilaceae        10
## 92      Peptostreptococcaceae        47
## 93              Phormidiaceae        76
## 94             Planococcaceae        50
## 95         Porphyromonadaceae        67
## 96             Porticoccaceae        20
## 97       Propionibacteriaceae       664
## 98           Pseudomonadaceae       196
## 99               Rhizobiaceae       177
## 100          Rhodobacteraceae       116
## 101            Rhodocyclaceae        32
## 102           Rhodothermaceae       511
## 103             Rikenellaceae       506
## 104           Ruminococcaceae      1034
## 105          Selenomonadaceae       284
## 106         Sphaerochaetaceae        60
## 107       Sphingobacteriaceae      1235
## 108         Sphingomonadaceae       157
## 109         Spirochaetaceae_B        23
## 110             Spirosomaceae       194
## 111         Staphylococcaceae       200
## 112          Streptococcaceae       299
## 113             Sulfolobaceae        93
## 114            Synergistaceae       100
## 115       Syntrophobotulaceae        21
## 116            Tannerellaceae       307
## 117         Thalassospiraceae       122
## 118                Thermaceae      1713
## 119 Thermodesulfovibrionaceae       287
## 120        Thermomicrobiaceae        76
## 121        Thermoplasmataceae        17
## 122            Thermotogaceae        37
## 123           Thermovirgaceae       109
## 124           Treponemataceae       384
## 125         Turicibacteraceae        88
## 126                      UA16        53
## 127                    UBA213         8
## 128                   UBA3002        27
## 129                   UBA5184        52
## 130                   UBA7430        27
## 131                    UBA955        33
## 132                   UKL13-3        66
## 133           Veillonellaceae       636
## 134       Verrucomicrobiaceae       106
## 135             Weeksellaceae      3408
## 136          Xanthomonadaceae      1251

print(paste0(sum(full.div.df$frequency)," total CAZymes"))

## [1] "61654 total CAZymes"

4. Principal Coordinates Analysis (PCoA)

The approach was create a data frame with the PULs diversity per MAG (IMPORTANT: diversity is DIFFERENT from frequency) because we wanted to see which MAGs had the most variety of PULs. This was also made with the CAZymes and first. Doing the filtering, we obtained a reduced matrix and obtained their distances with the vegdist(mtx,"bray") function from the vegan package. Finally, we plot the PCoA by their Ecosystem, Class, Order and Family for PULs and CAZymes.

PULs code

## DATA PREPARATION
## Extracting all the metadata for the subsecuent analysis
## There are no other metadata dataframes than the extracted here
## Set working directory
setwd("/home/chombo/Documents/SEGOVIA/CAZymes_PULs/data/")

## Read the MAGs metadata and extract the columns: class, order, family, ecosystem
MAGS.meta <- read.table(file = "1069_MAGs_metadata.tsv",sep = "\t",header = TRUE,row.names = 1)
MAGS.meta <- MAGS.meta %>%
    select(class,order,family,ecosystem_category)
colnames(MAGS.meta) <- c("Class","Order","Family","Ecosystem")
MAGS.meta$Ecosystem <- str_replace(MAGS.meta$Ecosystem," ","_")

## Read the PULs per MAG metadata
PULSperMAG <- read.table(file = "PULsperMAG.tsv",sep = "\t",header = FALSE,row.names = 1)
colnames(PULSperMAG) <- c("PULs")

## Merge both data frames
MAGS.meta <- merge(MAGS.meta, PULSperMAG,
                          by = 'row.names', all = TRUE)

## Import the data for a matrix
MAGS.arr <- as.vector(read.table("1069MAGS_list.txt"))[[1]]


## Extract the PULs that are present >= 25 MAGs
all.puls <- data.frame()
for (mag in MAGS.arr) {
    ## Filter the 5 MAGs that do not have significant PULs associated with them
    if (mag == "3300005326_35" || mag == "3300005370_1" || mag == "3300017650_9" || mag == "3300027284_65" || mag == "3300027607_50") {
        next
    }
    
    df.temp <- read.table(file = paste(mag,"/",mag,".matches_counts.out",sep = ""),sep = "\t",)
    colnames(df.temp) <- c("pul","frequency")
    df.temp["mag"] <- mag
    df.temp <- df.temp %>%
        select(mag,pul,everything())
    
    ## Create a dataframe with the nodes: mag = from, pul = to, freq = value
    all.puls <- rbind(all.puls,df.temp)
}


## Extract the PULs counts per MAG
mags.data <- read.table(file = "PULsperMAG.tsv",sep = "\t",header = FALSE)
colnames(mags.data) <- c("mag","Freq")
# summary(mags.data$Freq)
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 0.00   27.00   48.00   57.67   73.00  273.00 
mags.data.cut <- mags.data[mags.data$Freq >= 48,]
# length(mags.data.cut$mag)
# [1] 240
filtered.mags.vec <- as.vector(mags.data.cut$mag)

## Extract PULs diversity
puls.data <- read.table(file = "PULS.diversity.tsv",sep = "\t",header = FALSE)
colnames(puls.data) <- c("puls","Freq")
# summary(puls.data$Freq)
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 1.00    2.00    6.00   26.54   19.50  988.00 
puls.data.cut <- puls.data[puls.data$Freq >= 27,]
# length(puls.data.cut$puls)
# [1] 285
filtered.puls.vec <- as.vector(puls.data.cut$puls)

# ===================================================================
## At the end we will be working with a matrix 445 MAGs x 303 PULs
# ===================================================================


## UNIVERSAL MATRIX GENERATION
## Use the current data
## Filter the original data by the filtered vectors
matrix.data <- all.puls[is.element(all.puls$mag,filtered.mags.vec),]
matrix.data <- matrix.data[is.element(matrix.data$pul,filtered.puls.vec),]

## Generate the matrix
mtx <- matrix.data %>%
    spread(pul, frequency) %>%
    column_to_rownames('mag') %>%
    as.matrix() %>%
    replace(is.na(.), 0)

CAZymes code

## DATA PREPARATION
## Extracting all the metadata for the subsecuent analysis
## There are no other metadata dataframes than the extracted here
## Set working directory
setwd("/home/chombo/Documents/SEGOVIA/CAZymes_PULs/data/")

## Read the MAGs metadata and extract the columns: class, order, family, ecosystem
MAGS.meta <- read.table(file = "1069_MAGs_metadata.tsv",sep = "\t",header = TRUE,row.names = 1)
MAGS.meta <- MAGS.meta %>%
    select(class,order,family,ecosystem_category)
colnames(MAGS.meta) <- c("Class","Order","Family","Ecosystem")
MAGS.meta$Ecosystem <- str_replace(MAGS.meta$Ecosystem," ","_")

## Read the PULs per MAG metadata
CAZYSperMAG <- read.table(file = "CAZysperMAG.tsv",sep = "\t",header = FALSE,row.names = 1)
colnames(CAZYSperMAG) <- c("CAZys")

## Merge both data frames
MAGS.meta <- merge(MAGS.meta, CAZYSperMAG,
                          by = 'row.names', all = TRUE)

## Import the data for a matrix
MAGS.arr <- as.vector(read.table("1069MAGS_list.txt"))[[1]]


## Extract the CAZys
all.cazys <- data.frame()
for (mag in MAGS.arr) {
    ## Filter the 5 MAGs that do not have significant CAZys associated with them
    if (mag == "3300007500_43" || mag == "3300011404_8" || mag == "3300012067_9" || mag == "3300012151_17" || mag == "3300012165_16" || mag == "3300007500_43") {
        next
    }
    
    df.temp <- read.table(file = paste(mag,"/output_dbcan4/",mag,".CAZyCounts.single.txt",sep = ""),sep = "\t",)
    colnames(df.temp) <- c("cazy","frequency")
    df.temp["mag"] <- mag
    df.temp <- df.temp %>%
        select(mag,cazy,everything())
    
    ## Create a dataframe with the nodes: mag = from, cazy = to, freq = value
    all.cazys <- rbind(all.cazys,df.temp)
}

## Extract the CAZys counts per MAG
mags.data <- read.table(file = "CAZysperMAG.tsv",sep = "\t",header = FALSE)
colnames(mags.data) <- c("mag","Freq")
# summary(mags.data$Freq)
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 0.00   27.00   48.00   57.67   73.00  273.00 
mags.data.cut <- mags.data[mags.data$Freq >= 48,]
# length(mags.data.cut$Var1)
# [1] 539
filtered.mags.vec <- as.vector(mags.data.cut$mag)

## Extract CAZys diversity
cazys.data <- read.table(file = "CAZYS.diversity.tsv",sep = "\t",header = FALSE)
colnames(cazys.data) <- c("cazy","Freq")
# summary(cazys.data$Freq)
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 1.0     6.0    27.0   145.4   108.2  5928.0 
cazys.data.cut <- cazys.data[cazys.data$Freq >= 27,]
# length(cazys.data.cut$Var1)
# [1] 214
filtered.cazys.vec <- as.vector(cazys.data.cut$cazy)

# ===================================================================
## At the end we will be working with a matrix 539 MAGs x 214 CAZys
# ===================================================================


## UNIVERSAL MATRIX GENERATION
## Use the current data
## Filter the original data by the filtered vectors
matrix.data <- all.cazys[is.element(all.cazys$mag,filtered.mags.vec),]
matrix.data <- matrix.data[is.element(matrix.data$cazy,filtered.cazys.vec),]

## Generate the matrix
mtx <- matrix.data %>%
    spread(cazy, frequency) %>%
    column_to_rownames('mag') %>%
    as.matrix() %>%
    replace(is.na(.), 0)

4.1 Filtering

After analyzing the obtained clusters, we decided to trim the data, staying only with the most significant clusters. This was only made with the PULs (since the PULs is our object of study) and the suggested approach is that the PULs cutoff PCoA distribution will be similar in the CAZymes with the same MAGs. Here is the code where we trim by ‘Class’, ‘Order’ and ‘Family’. It is important to remark that the ‘Ecosystem’ distribution was poorly well distributed. At the end, there were only 343 MAGs left. And those are the ones we will be working with from now on.

4.1.1 PULs

4.1.1.1 Ecosystem

Ecosystem before filtering

Ecosystem after filtering

4.1.1.2 Class

Class before filtering

Class after filtering

4.1.1.3 Order

Order before filtering

Order after filtering

4.1.1.4 Family

Family before filtering

Family after filtering

4.1.2 CAZymes

4.1.2.1 Ecosystem

Ecosystem before filtering

Ecosystem after filtering

4.1.2.2 Class

Class before filtering

Class after filtering

4.1.2.3 Order

Order before filtering

Order after filtering

4.1.2.4 Family

Family before filtering

Family after filtering

5. Distribution

After the final filtering. The 343 MAGs were stored in a matrix against the PULs and the CAZymes to see how were they distributed. For that, hclust() function was used as well as the vegdist(mtx,"bray") function for obtaining the distances. Here are the obtained distributions.

## Libraries
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(pheatmap))
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(textshape))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(vegan))
suppressPackageStartupMessages(library(dendextend))
suppressPackageStartupMessages(library(circlize))

## FUNCTIONS
## EXTRACT CAZYMES
get.cazymes <- function(mags) {
    all.cazys <- data.frame()
    for (mag in mags) {
        ## Filter the 5 MAGs that do not have significant PULs associated with them
        if (mag == "3300007500_43" || mag == "3300011404_8" || mag == "3300012067_9" || mag == "3300012151_17" || mag == "3300012165_16" || mag == "3300007500_43") {
            next
        }
        
        df.temp <- read.table(file = paste(mag,"/output_dbcan4/",mag,".CAZyCounts.single.txt",sep = ""),sep = "\t",)
        colnames(df.temp) <- c("cazy","frequency")
        df.temp["mag"] <- mag
        df.temp <- df.temp %>%
            select(mag,cazy,everything())
        
        ## Create a dataframe with the nodes: mag = from, cazy = to, freq = value
        all.cazys <- rbind(all.cazys,df.temp)
    }
    return(all.cazys)
}

## EXTRACT PULS
get.puls <- function(mags) {
    all.puls <- data.frame()
    for (mag in mags) {
        # ## Filter the 5 MAGs that do not have significant PULs associated with them
        if (mag == "3300005326_35" || mag == "3300005370_1" || mag == "3300017650_9" || mag == "3300027284_65" || mag == "3300027607_50") {
            next
        }
        
        df.temp <- read.table(file = paste(mag,"/",mag,".matches_counts.out",sep = ""),sep = "\t",)
        colnames(df.temp) <- c("pul","frequency")
        df.temp["mag"] <- mag
        df.temp <- df.temp %>%
            select(mag,pul,everything())
        
        ## Create a dataframe with the nodes: mag = from, pul = to, freq = value
        all.puls <- rbind(all.puls,df.temp)
    }
    return(all.puls)
}

## DATA PREPARATION
## Extracting all the metadata for the subsecuent analysis
## There are no other metadata dataframes than the extracted here
## Set working directory
setwd("/home/chombo/Documents/SEGOVIA/CAZymes_PULs/data/")

## Read the MAGs metadata and extract the columns: class, order, family, ecosystem
MAGS.meta <- read.table(file = "1069_MAGs_metadata.tsv",sep = "\t",header = TRUE,row.names = 1)
MAGS.meta <- MAGS.meta %>%
    select(PULsFrequency,CAZysFrequency,class,order,family,specie,ecosystem_category)
colnames(MAGS.meta) <- c("PULS","CAZYS","Class","Order","Family","Specie","Ecosystem")
MAGS.meta$Ecosystem <- str_replace(MAGS.meta$Ecosystem," ","_")
MAGS.meta["mag"] <- rownames(MAGS.meta)

## Read the 343 MAGs new list
MAGS.arr <- as.vector(read.table("343_MAGslist.txt"))[[1]]

## Filter metadata
MAGS.meta <- MAGS.meta[is.element(MAGS.meta$mag,MAGS.arr),]

## Extract the counts
all.cazys <- get.cazymes(mags = MAGS.arr)

## Create the matrix
mtx <- all.cazys %>%
    spread(cazy, frequency) %>%
    column_to_rownames('mag') %>%
    as.matrix() %>%
    replace(is.na(.), 0)

#mtx <- vegdist(mtx,"bray")
dend <- as.dendrogram(hclust(vegdist(mtx,"bray")))

## Colors
dend <- dend %>%
    color_branches(k = 19) %>%
    raise.dendrogram (15) %>%
    hang.dendrogram()

## Control quality plot
ggplot(dend, labels = FALSE) + scale_y_reverse(expand = c(0.2, 0)) + coord_polar(theta="x")
png(filename = "CAZy_dend_test.png",width = 4000,height = 2000,res = 100)
factoextra::fviz_dend(dend, cex = 0.5, k = 20, 
          color_labels_by_k = TRUE, rect = TRUE)
dev.off()

## Write the tree
phy <- ape::as.phylo(hclust(vegdist(mtx,"bray")))
ape::write.tree(phy = phy,file='CAZy343_ward_Class.tree') 

## Repeat the whole process with PULs
all.puls <- get.puls(mags = MAGS.arr)

## Create the matrix
mtx <- all.puls %>%
    spread(pul, frequency) %>%
    column_to_rownames('mag') %>%
    as.matrix() %>%
    replace(is.na(.), 0)

#mtx <- vegdist(mtx,"bray")
dend <- as.dendrogram(hclust(vegdist(mtx,"bray")))

## Colors
dend <- dend %>%
    color_branches(k = 19) %>%
    raise.dendrogram (15) %>%
    hang.dendrogram()

## Control quality plot
ggplot(dend, labels = FALSE) + scale_y_reverse(expand = c(0.2, 0)) + coord_polar(theta="x")
png(filename = "PULs_dend_test.png",width = 4000,height = 2000,res = 100)
factoextra::fviz_dend(dend, cex = 0.5, k = 20, 
                      color_labels_by_k = TRUE, rect = TRUE)
dev.off()

## Write the tree
phy <- ape::as.phylo(hclust(vegdist(mtx,"bray")))
ape::write.tree(phy = phy,file='PULs343_ward_Class.tree')

purple: Gammaproteobacteria
violet: Clostridia
red: Bacteroidia
yellow: Negativicutes
green: Deinococci
blue: Bacilli

PULs

CAZymes

5.1 CAZyme families per Family

The distribution fo CAZyme families per Family.

6. Substrates

Here are the substrates per Family identified for the PULs. The cutoff for the substrates was >= 100 counts from total. (A cutoff by the median was also evaluated. There is no meaningful difference between the cutoff by 100 and by the median).

PULs substrate got from: https://bcb.unl.edu/static/DBCAN-PUL/dbCAN-PUL_v5_05-10-2023.xlsx
CAZymes substrate got from: https://bcb.unl.edu/dbCAN_sub/data/fam-substrate-mapping-08252022.tsv

6.1 Code

## SUBSTRATES
## PULS
## Load the PULs substrate dataset
puls.substrate <- read_excel("PULSUB_05102023.xlsx")
puls.substrate <- puls.substrate %>%
    select(PULID,substrate_final,cazymes_predicted_dbcan)
colnames(puls.substrate) <- c("pul","substrate","predictedCAZY")

## CLEAN MAGS BY THEIR SUBSTRATE
all.puls <- merge(all.puls,puls.substrate, by = "pul",all.x = TRUE) %>%
    select(!c("predictedCAZY"))
all.puls$substrate <- replace_na(all.puls$substrate,"Unknown")
all.puls <- all.puls %>%
    filter(substrate != "Unknown")

# ================================================================================
## Unknown PULs = 7812
## Left PULs = 7938
## MAGs = 343
## Substrates = 143 w/Unknown
# ================================================================================

substrates.unique <- all.puls %>%
    filter(substrate != "Unknown") %>%
    group_by(substrate) %>%
    summarize(counts = sum(frequency))
## summary(substrates.unique$counts)
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 1.00    7.00   24.50   66.92   74.75  651.00

substrates.unique <- substrates.unique %>%
    filter(counts >= 24)
substrates.arr <- c(substrates.unique$substrate)
all.puls <- all.puls[is.element(all.puls$substrate,substrates.arr),]


## CAZYMES
## Working with CAZymes is a little bit different because they have their own substrate binding module
## i.e., they are associated with an specific substrate. That is why the approach will be to group in
## a list all the possible substrates and substrate binding modules by each of the 175 available CAZymes
## in the 343 MAGs
## Load the CAZymes substrate dataset
cazymes.substrate <- read.table("CAZYSUB_08252022.tsv", sep = "\t", header = TRUE)
cazymes.substrate <- cazymes.substrate %>%
    select(Family,Substrate_high_level,Name)
colnames(cazymes.substrate) <- c("cazy","substrate","enzyme")

## CLEAN MAGS BY THEIR SUBSTRATE
## Create an array with te unique CAZymes of the 343 MAGs
all.cazys.copy <- all.cazys
all.cazys.copy$cazy <- gsub("_\\d+","",all.cazys.copy$cazy)

## Remove the GTs
all.cazys.copy <- all.cazys.copy %>%
    filter(!grepl("GT",cazy))
unique.cazys <- c(sort(unique(all.cazys.copy$cazy)))

## Create a list with the metadata of all the unique CAZymes
cazys.sub.list <- list()
cazy.sub.unknown <- c()
for (i in unique.cazys) {
    df.temp <- subset(cazymes.substrate,is.element(cazymes.substrate$cazy,i))
    cazys.sub.list[[i]] <- df.temp
    
    ## Store the CAZymes with unknown substrates for further analysis
    if (length(row.names(df.temp)) == 0) {
        cazy.sub.unknown <- c(cazy.sub.unknown,i)
    }
}
## Some CAZymes were not found in the substrate dataset 
## They were searched manually
cazys.sub.list[["PL27"]] <- data.frame(cazy = c("PL27","PL27"),
                                   substrate = c("L-rhamnose","D-gluconarate"),
                                   enzyme = c("L-rhamnose-α-1","4-D-glucuronate lyase"))
cazys.sub.list[["PL35"]] <- data.frame(cazy = c("PL35"),
                                   substrate = c("chondroitin"),
                                   enzyme = c("Chondroitin lyase"))
cazys.sub.list[["PL40"]] <- data.frame(cazy = c("PL40"),
                                   substrate = c("ulvan"),
                                   enzyme = c("Ulvan lyase"))

## Merge the CAZymes with the substrates
## Extract the substrates from each cazy data frame and merge them so they can be incorporated to the all.cazys data frame
trans.cazys.sub <- data.frame()
for (cazy in unique.cazys) {
    sub.vec <- sort(unique(cazys.sub.list[[cazy]]$substrate))
    sub.str <- paste(sub.vec,collapse = ",")
    trans.cazys.sub <- rbind(trans.cazys.sub,data.frame(cazy = cazy,substrate = sub.str))
}

## Merge the substrates with the all.cazys data frame
all.cazys.copy <- merge(all.cazys.copy,trans.cazys.sub,
                        by = "cazy",all.x = TRUE)

# ================================================================================
## GT CAZymes = 6647
## Left CAZymes = 11547
## MAGs = 343
## Substrates = 87
# ================================================================================

## Filter the substrates by their median
substrates.unique <- all.cazys.copy %>%
    group_by(substrate) %>%
    summarize(counts = sum(frequency))
# summary(substrates.unique$counts)
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 1.0     9.5    54.0   132.7   137.0  1036.0

substrates.unique <- substrates.unique %>%
    filter(counts >= 54)
substrates.arr <- c(substrates.unique$substrate)
all.cazys.copy <- all.cazys.copy[is.element(all.cazys.copy$substrate,substrates.arr),]

6.2 PULs

PULs substrates were obtained by merging the substrate metadata got from dbCAN. The cutoff value was 24 which corresponds to the median of the PULs frequency.

6.3 CAZymes

CAZymes substrates were obtained by merging the substrate metadata got from dbCAN and then exclude the GT family. The cutoff value was 54 which corresponds to the median of the CAZymes frequency.

Workflow: PULs diversity in the Pacific Ocean’s sediment

Bernardo Chombo-Álvarez

2024-02-24