suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(dendextend))
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(textshape))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(vegan))
suppressPackageStartupMessages(library(pracma))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(readxl))
session_info()
# ─ Session info ───────────────────────────────────────
# setting value
# version R version 4.3.2 (2023-10-31)
# os Manjaro Linux
# system x86_64, linux-gnu
# ui RStudio
# language (EN)
# collate en_US.UTF-8
# ctype en_US.UTF-8
# tz America/Mexico_City
# date 2024-02-21
# rstudio 2023.09.1+494 Desert Sunflower (desktop)
# pandoc 3.1.6 @ /usr/bin/ (via rmarkdown)
#
# ─ Packages ───────────────────────────────────────────
# package * version date (UTC) lib source
# ape 5.7-1 2023-03-13 [1] CRAN (R 4.3.2)
# cellranger 1.1.0 2016-07-27 [1] CRAN (R 4.3.2)
# circlize * 0.4.16 2024-02-20 [1] CRAN (R 4.3.2)
# cli 3.6.2 2023-12-11 [1] CRAN (R 4.3.2)
# cluster 2.1.4 2022-08-22 [2] CRAN (R 4.3.2)
# colorspace 2.1-0 2023-01-23 [1] CRAN (R 4.3.1)
# data.table 1.15.0 2024-01-30 [1] CRAN (R 4.3.2)
# dendextend * 1.17.1 2023-03-25 [1] CRAN (R 4.3.2)
# digest 0.6.34 2024-01-11 [1] CRAN (R 4.3.2)
# dplyr * 1.1.4 2023-11-17 [1] CRAN (R 4.3.2)
# evaluate 0.23 2023-11-01 [1] CRAN (R 4.3.2)
# factoextra 1.0.7 2020-04-01 [1] CRAN (R 4.3.2)
# fansi 1.0.6 2023-12-08 [1] CRAN (R 4.3.2)
# fastmap 1.1.1 2023-02-24 [1] CRAN (R 4.3.2)
# generics 0.1.3 2022-07-05 [1] CRAN (R 4.3.2)
# ggplot2 * 3.4.4 2023-10-12 [1] CRAN (R 4.3.2)
# ggrepel 0.9.5 2024-01-10 [1] CRAN (R 4.3.2)
# GlobalOptions 0.1.2 2020-06-10 [1] CRAN (R 4.3.2)
# glue 1.7.0 2024-01-09 [1] CRAN (R 4.3.2)
# gridExtra 2.3 2017-09-09 [1] CRAN (R 4.3.2)
# gtable 0.3.4 2023-08-21 [1] CRAN (R 4.3.1)
# htmltools 0.5.7 2023-11-03 [1] CRAN (R 4.3.2)
# knitr 1.45 2023-10-30 [1] CRAN (R 4.3.2)
# lattice * 0.21-9 2023-10-01 [2] CRAN (R 4.3.2)
# lifecycle 1.0.4 2023-11-07 [1] CRAN (R 4.3.2)
# magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.3.1)
# MASS 7.3-60 2023-05-04 [2] CRAN (R 4.3.2)
# Matrix 1.6-1.1 2023-09-18 [2] CRAN (R 4.3.2)
# mgcv 1.9-0 2023-07-11 [2] CRAN (R 4.3.2)
# munsell 0.5.0 2018-06-12 [1] CRAN (R 4.3.1)
# nlme 3.1-163 2023-08-09 [2] CRAN (R 4.3.2)
# permute * 0.9-7 2022-01-27 [1] CRAN (R 4.3.2)
# pheatmap * 1.0.12 2019-01-04 [1] CRAN (R 4.3.2)
# pillar 1.9.0 2023-03-22 [1] CRAN (R 4.3.1)
# pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.3.1)
# pracma * 2.4.4 2023-11-10 [1] CRAN (R 4.3.2)
# purrr 1.0.2 2023-08-10 [1] CRAN (R 4.3.2)
# R6 2.5.1 2021-08-19 [1] CRAN (R 4.3.1)
# RColorBrewer 1.1-3 2022-04-03 [1] CRAN (R 4.3.1)
# Rcpp 1.0.12 2024-01-09 [1] CRAN (R 4.3.2)
# readxl * 1.4.3 2023-07-06 [1] CRAN (R 4.3.2)
# rlang 1.1.3 2024-01-10 [1] CRAN (R 4.3.2)
# rmarkdown 2.25 2023-09-18 [1] CRAN (R 4.3.2)
# rsconnect 1.2.0 2023-12-15 [1] CRAN (R 4.3.2)
# rstudioapi 0.15.0 2023-07-07 [1] CRAN (R 4.3.2)
# scales 1.3.0 2023-11-28 [1] CRAN (R 4.3.2)
# sessioninfo * 1.2.2 2021-12-06 [1] CRAN (R 4.3.2)
# shape 1.4.6 2021-05-19 [1] CRAN (R 4.3.2)
# stringi 1.8.3 2023-12-11 [1] CRAN (R 4.3.2)
# stringr * 1.5.1 2023-11-14 [1] CRAN (R 4.3.2)
# textshape * 1.7.3 2021-05-28 [1] CRAN (R 4.3.2)
# tibble 3.2.1 2023-03-20 [1] CRAN (R 4.3.1)
# tidyr * 1.3.1 2024-01-24 [1] CRAN (R 4.3.2)
# tidyselect 1.2.0 2022-10-10 [1] CRAN (R 4.3.2)
# utf8 1.2.4 2023-10-22 [1] CRAN (R 4.3.1)
# vctrs 0.6.5 2023-12-01 [1] CRAN (R 4.3.2)
# vegan * 2.6-4 2022-10-11 [1] CRAN (R 4.3.2)
# viridis 0.6.5 2024-01-29 [1] CRAN (R 4.3.2)
# viridisLite 0.4.2 2023-05-02 [1] CRAN (R 4.3.1)
# withr 3.0.0 2024-01-16 [1] CRAN (R 4.3.2)
# xfun 0.41 2023-11-01 [1] CRAN (R 4.3.2)
# yaml 2.3.8 2023-12-11 [1] CRAN (R 4.3.2)
#
# [1] /home/chombo/R/x86_64-pc-linux-gnu-library/4.3
# [2] /usr/lib/R/library
BASH CLEANING
## First we cleaned the data from the 52514 MAGs from Nayfach project
## Here we extracted the taxonomic classification column and split it into different columns
## Then we merge the output with the complete Nayfach project metadata
paste <(cut -f1,15 AllMAGs-genome_metadata.tsv | tr ";" "\t" | awk '{ if (NR != 1) print }' | sed 's/[a-z]__//g' | awk -F "\t" '{ print $2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$8 }') <(awk -F "\t" '{if (NR != 1) print}' AllMAGs-genome_metadata.tsv | cut --complement -d$'\t' -f15) > MAGs_test.tsv
R CLEANING
## THIS IS IN THE '01_MetadataCleaning.R'
## Then we filtered the MAGs by the following parameters
## Completness >= 99
## Contamination <= 0.05
## Read the file
setwd(dir = '/home/chombo/Documents/SEGOVIA/CAZymes_PULs/data/')
colnames.vec <- c("division","phylum","class","order","family","specie","genome_id","metagenome_id","genome_length","num_contigs","n50","num_16s","num_5s","num_23s","num_trna","completeness","contamination","quality_score","mimag_quality","otu_id","ecosystem_category","ecosystem_type","habitat","longitude","latitude")
MAGS.df <- read.table('MAGs_test.tsv',sep = "\t",col.names = colnames.vec)
## Filter the file
## Completness >= 99%
## Contamination <= 0.05%
MAGS.filtered <- subset(MAGS.df,completeness >= 99 & contamination <= 0.05 & family != "")
MAGS.filtered <- MAGS.filtered %>%
select(genome_id,metagenome_id,everything())
MAGS.filtered <- MAGS.filtered[order(MAGS.filtered$genome_id),]
## Rename specific cases
MAGS.filtered$class[MAGS.filtered$class == "Bacilli_A"] <- "Bacilli"
MAGS.filtered$class[MAGS.filtered$class == "Thermoplasmata_A"] <- "Thermoplasmata"
MAGS.filtered$order[MAGS.filtered$order == "Bacillales_A"] <- "Bacillales"
## Save the filtered file
## Output: 1069 MAGs
# write.table(MAGS.filtered,file = "1069_MAGs_metadata.tsv",sep = "\t",row.names = FALSE)
## Filter diversity by ecosystem type, class, order and family
## Ecosystem
eco.div <- as.data.frame(table(MAGS.filtered$ecosystem_category))
colnames(eco.div) <- c("ecosystem","frequency")
## Class
class.div <- as.data.frame(table(MAGS.filtered$class))
colnames(class.div) <- c("class","frequency")
## Order
order.div <- as.data.frame(table(MAGS.filtered$order))
colnames(order.div) <- c("order","frequency")
## Family
fam.div <- as.data.frame(table(MAGS.filtered$family))
colnames(fam.div) <- c("family","frequency")
# write.table(MAGS.filtered$genome_id,file = "1069MAGS_list.txt",sep = "",row.names = FALSE,quote = FALSE,col.names = FALSE)
# write.table(eco.div$ecosystem,file = "1069Eco_list.txt",sep = "",row.names = FALSE,quote = FALSE,col.names = FALSE)
# write.table(class.div$class,file = "1069Class_list.txt",sep = "",row.names = FALSE,quote = FALSE,col.names = FALSE)
# write.table(order.div$order,file = "1069Order_list.txt",sep = "",row.names = FALSE,quote = FALSE,col.names = FALSE)
# write.table(fam.div$family,file = "1069Fam_list.txt",sep = "",row.names = FALSE,quote = FALSE,col.names = FALSE)
print(paste0("Total Filtered MAGs: ",length(unique(c(MAGS.filtered$genome_id)))))
## [1] "Total Filtered MAGs: 1069"
Here we are going to wotk with the 1069 filtered MAGs. DBcan4 had already been run and DIAMOND had also already been run for obtaining the PULs. These scripts are in the ‘/src/BASH/’ folder inside the repo.
## PULs Counts
## Import MAGs list
setwd(dir = '/home/chombo/Documents/SEGOVIA/CAZymes_PULs/data/')
MAGS.arr <- as.vector(read.table("1069MAGS_list.txt"))[[1]]
## Create a df with all the PUL counts according to each MAG
PULS.df <- data.frame()
for (mag in MAGS.arr) {
## Filter the 5 MAGs that do not have significant PULs associated with them
if (mag == "3300005326_35" || mag == "3300005370_1" || mag == "3300017650_9" || mag == "3300027284_65" || mag == "3300027607_50") {
next
}
df.temp <- read.table(paste(mag,"/",mag,".matches_evalue.out",sep = ""))
colnames(df.temp) <- c("PULs","evalue")
df.temp["MAG"] <- mag
## Save the PULs frequency
freq <- as.data.frame(table(df.temp$PULs))
freq <- freq[order(freq$Freq,decreasing = TRUE),]
write.table(freq,file = paste(mag,"/",mag,".matches_counts.out",sep = ""),sep = "\t",quote = FALSE,col.names = FALSE,row.names = FALSE)
## Store them in a single df
PULS.df <- rbind(PULS.df,df.temp)
}
print(paste0(length(PULS.df$PULs)," total PULs"))
## [1] "36172 total PULs"
print(paste0(length(unique(PULS.df$PULs))," different PULs"))
## [1] "1363 different PULs"
print(paste0(length(unique(PULS.df$MAG))," final MAGs"))
## [1] "1064 final MAGs"
## 36172 total PULs
## 1363 different PULs
## 1064 final MAGs
## Get the summary
PULSperMAG.df <- as.data.frame(table(PULS.df$MAG))
colnames(PULSperMAG.df) <- c("MAGs","PULs_Freq")
excludedMAGS <- data.frame(MAGs = c("3300005326_35","3300005370_1","3300017650_9","3300027284_65","3300027607_50"),
PULs_Freq = c(0,0,0,0,0))
PULSperMAG.df <- rbind(PULSperMAG.df,excludedMAGS)
## Get the PULs diversity
puls.diversity <- as.data.frame(table(PULS.df$PULs))
## Read the metadata and filter it
setwd(dir = '/home/chombo/Documents/SEGOVIA/CAZymes_PULs/data/')
MAGS.meta <- read.table(file = "1069_MAGs_metadata.tsv",sep = "\t",header = TRUE)
MAGS.meta <- MAGS.meta %>%
select(genome_id,class,order,family,ecosystem_category)
MAGS.meta$ecosystem_category <- gsub(" ","_",MAGS.meta$ecosystem_category)
## Remove the excluded MAGs
MAGS.meta <- MAGS.meta[!is.element(MAGS.meta$genome_id,c("3300005326_35","3300005370_1","3300017650_9","3300027284_65","3300027607_50")),]
## PULs per Ecosystem
eco.arr <- MAGS.meta %>%
distinct(ecosystem_category)
eco.arr <- eco.arr[order(eco.arr$ecosystem_category,decreasing = FALSE),]
full.div.df <- data.frame()
for (eco in eco.arr) {
## Extract all the MAGs annotated in each Ecosystem
mags.arr.temp <- as.vector(MAGS.meta$genome_id[MAGS.meta$ecosystem_category == eco])
## For each MAG, extract all the PUL counts
PULSperMAG.temp <- PULSperMAG.df[is.element(PULSperMAG.df$MAGs,mags.arr.temp),]
## Extract the complete diversity
eco.div.df <- data.frame(ecosystem = c(eco),
frequency = sum(PULSperMAG.temp$PULs_Freq))
full.div.df <- rbind(full.div.df,eco.div.df)
## Extract the PULs diversity
rawPULs.temp <- PULS.df[is.element(PULS.df$MAG,mags.arr.temp),]
pulscounts.temp <- as.data.frame(table(rawPULs.temp$PULs))
colnames(pulscounts.temp) <- c("PULs","frequency")
pulscounts.temp <- pulscounts.temp[order(pulscounts.temp$frequency,decreasing = TRUE),]
}
head(full.div.df,24L)
## ecosystem frequency
## 1 Algae 17
## 2 Animal 599
## 3 Annelida 33
## 4 Aquatic 6186
## 5 Arthropoda 344
## 6 Bioreactor 856
## 7 Bioremediation 107
## 8 Biotransformation 38
## 9 Birds 155
## 10 Built_environment 5285
## 11 Cnidaria 25
## 12 Fungi 75
## 13 Human 16581
## 14 Insecta 390
## 15 Lab_enrichment 892
## 16 Lab_synthesis 119
## 17 Mammals 782
## 18 Microbial 109
## 19 Modeled 29
## 20 Plants 422
## 21 Solid_waste 247
## 22 Terrestrial 2134
## 23 Wastewater 747
print(paste0(sum(full.div.df$frequency)," total PULs"))
## [1] "36172 total PULs"
## Read the metadata and filter it
setwd(dir = '/home/chombo/Documents/SEGOVIA/CAZymes_PULs/data/')
## PULs per Class
class.arr <- MAGS.meta %>%
distinct(class)
class.arr <- class.arr[order(class.arr$class,decreasing = FALSE),]
full.div.df <- data.frame()
for (class in class.arr) {
## Extract all the MAGs annotated in each Class
mags.arr.temp <- as.vector(MAGS.meta$genome_id[MAGS.meta$class == class])
## For each MAG, extract all the PUL counts
PULSperMAG.temp <- PULSperMAG.df[is.element(PULSperMAG.df$MAGs,mags.arr.temp),]
## Extract the complete diversity
class.div.df <- data.frame(class = c(class),
frequency = sum(PULSperMAG.temp$PULs_Freq))
full.div.df <- rbind(full.div.df,class.div.df)
## Extract the PULs diversity
rawPULs.temp <- PULS.df[is.element(PULS.df$MAG,mags.arr.temp),]
pulscounts.temp <- as.data.frame(table(rawPULs.temp$PULs))
colnames(pulscounts.temp) <- c("PULs","frequency")
pulscounts.temp <- pulscounts.temp[order(pulscounts.temp$frequency,decreasing = TRUE),]
}
head(full.div.df,39L)
## class frequency
## 1 AC1 74
## 2 Acidobacteriae 279
## 3 Actinobacteria 1867
## 4 Alphaproteobacteria 1477
## 5 Archaeoglobi 165
## 6 Bacilli 4020
## 7 Bacteroidia 7222
## 8 Bdellovibrionia 38
## 9 Campylobacteria 102
## 10 Chloroflexia 21
## 11 Cloacimonadia 48
## 12 Clostridia 3347
## 13 Coprothermobacteria 96
## 14 Coriobacteriia 651
## 15 Cyanobacteriia 111
## 16 Dehalococcoidia 5
## 17 Deinococci 2498
## 18 Desulfitobacteriia 79
## 19 Desulfobacteria 258
## 20 Desulfovibrionia 421
## 21 Dictyoglomia 69
## 22 Fusobacteriia 164
## 23 Gammaproteobacteria 6597
## 24 Halobacteria 70
## 25 Methanobacteria 284
## 26 Methanomicrobia 93
## 27 Methanosarcinia 311
## 28 Negativicutes 3138
## 29 Nitrososphaeria 155
## 30 Rhodothermia 116
## 31 Spirochaetia 648
## 32 Synergistia 438
## 33 Thermodesulfovibrionia 366
## 34 Thermoplasmata 103
## 35 Thermoprotei 211
## 36 Thermotogae 180
## 37 UBA5184 51
## 38 Verrucomicrobiae 399
print(paste0(sum(full.div.df$frequency)," total PULs"))
## [1] "36172 total PULs"
## Read the metadata and filter it
setwd(dir = '/home/chombo/Documents/SEGOVIA/CAZymes_PULs/data/')
## PULs per Order
order.arr <- MAGS.meta %>%
distinct(order)
order.arr <- order.arr[order(order.arr$order,decreasing = FALSE),]
full.div.df <- data.frame()
for (ord in order.arr) {
## Extract all the MAGs annotated in each Order
mags.arr.temp <- as.vector(MAGS.meta$genome_id[MAGS.meta$order == ord])
## For each MAG, extract all the PUL counts
PULSperMAG.temp <- PULSperMAG.df[is.element(PULSperMAG.df$MAGs,mags.arr.temp),]
## Extract the complete diversity
order.div.df <- data.frame(order = c(ord),
frequency = sum(PULSperMAG.temp$PULs_Freq))
full.div.df <- rbind(full.div.df,order.div.df)
## Extract the PULs diversity
rawPULs.temp <- PULS.df[is.element(PULS.df$MAG,mags.arr.temp),]
pulscounts.temp <- as.data.frame(table(rawPULs.temp$PULs))
colnames(pulscounts.temp) <- c("PULs","frequency")
pulscounts.temp <- pulscounts.temp[order(pulscounts.temp$frequency,decreasing = TRUE),]
}
head(full.div.df,81L)
## order frequency
## 1 AC1 74
## 2 Acetivibrionales 68
## 3 Acetobacterales 48
## 4 Acidaminococcales 1343
## 5 Acidiprofundales 38
## 6 Acidobacteriales 279
## 7 Actinomycetales 1110
## 8 Archaeoglobales 165
## 9 Azospirillales 74
## 10 Bacillales 81
## 11 Bacteroidales 2771
## 12 Balneolales 59
## 13 Bdellovibrionales 38
## 14 Betaproteobacteriales 2553
## 15 Brevibacillales 50
## 16 Caldiarchaeales 32
## 17 Campylobacterales 102
## 18 Caulobacterales 439
## 19 Chitinophagales 571
## 20 Christensenellales 59
## 21 Cloacimonadales 48
## 22 Coprothermobacterales 96
## 23 Coriobacteriales 651
## 24 Corynebacteriales 379
## 25 Cyanobacteriales 111
## 26 Cytophagales 231
## 27 Dehalococcoidales 5
## 28 Deinococcales 2498
## 29 Desulfitobacteriales 79
## 30 Desulfobacterales 258
## 31 Desulfovibrionales 421
## 32 Dictyoglomales 69
## 33 Enterobacterales 1536
## 34 Erysipelotrichales 1887
## 35 Exiguobacterales 210
## 36 Flavobacteriales 3195
## 37 Fusobacteriales 164
## 38 Halobacteriales 70
## 39 Haloplasmatales 35
## 40 Lachnospirales 2316
## 41 Lactobacillales 1501
## 42 Methanobacteriales 284
## 43 Methanomassiliicoccales 9
## 44 Methanomicrobiales 93
## 45 Methanosarcinales 204
## 46 Methanotrichales 107
## 47 ML615J-28 17
## 48 Mycoplasmatales 35
## 49 Nitrososphaerales 123
## 50 NS11-12g 103
## 51 Oscillospirales 708
## 52 Paenibacillales 33
## 53 Peptostreptococcales 146
## 54 Propionibacteriales 378
## 55 Pseudomonadales 2023
## 56 RFN20 2
## 57 Rhizobiales 477
## 58 Rhodobacterales 154
## 59 Rhodospirillales 92
## 60 Rhodothermales 57
## 61 Selenomonadales 156
## 62 Sneathiellales 37
## 63 Sphaerochaetales 7
## 64 Sphingobacteriales 351
## 65 Sphingomonadales 69
## 66 Spirochaetales 33
## 67 Staphylococcales 169
## 68 Sulfolobales 211
## 69 Synergistales 438
## 70 Thermodesulfovibrionales 366
## 71 Thermomicrobiales 21
## 72 Thermoplasmatales 56
## 73 Thermotogales 180
## 74 Tissierellales 50
## 75 Treponematales 608
## 76 UBA5184 51
## 77 UBA998 87
## 78 Veillonellales 1639
## 79 Verrucomicrobiales 399
## 80 Xanthomonadales 485
print(paste0(sum(full.div.df$frequency)," total PULs"))
## [1] "36172 total PULs"
## Read the metadata and filter it
setwd(dir = '/home/chombo/Documents/SEGOVIA/CAZymes_PULs/data/')
## PULs per Family
fam.arr <- MAGS.meta %>%
distinct(family)
fam.arr <- fam.arr[order(fam.arr$family,decreasing = FALSE),]
full.div.df <- data.frame()
for (fam in fam.arr) {
## Extract all the MAGs annotated in each Family
mags.arr.temp <- as.vector(MAGS.meta$genome_id[MAGS.meta$family == fam])
## For each MAG, extract all the PUL counts
PULSperMAG.temp <- PULSperMAG.df[is.element(PULSperMAG.df$MAGs,mags.arr.temp),]
## Extract the complete diversity
fam.div.df <- data.frame(family = c(fam),
frequency = sum(PULSperMAG.temp$PULs_Freq))
full.div.df <- rbind(full.div.df,fam.div.df)
## Extract the PULs diversity
rawPULs.temp <- PULS.df[is.element(PULS.df$MAG,mags.arr.temp),]
pulscounts.temp <- as.data.frame(table(rawPULs.temp$PULs))
colnames(pulscounts.temp) <- c("PULs","frequency")
pulscounts.temp <- pulscounts.temp[order(pulscounts.temp$frequency,decreasing = TRUE),]
}
head(full.div.df,139L)
## family frequency
## 1 1G12 25
## 2 Acetivibrionaceae 68
## 3 Acetobacteraceae 48
## 4 Acidaminococcaceae 1343
## 5 Acidobacteriaceae 26
## 6 Aerococcaceae 46
## 7 Akkermansiaceae 156
## 8 Alcanivoracaceae 158
## 9 Aminobacteriaceae 41
## 10 Anaerovoracaceae 73
## 11 Archaeoglobaceae 165
## 12 Atopobiaceae 123
## 13 Bacteroidaceae 301
## 14 Balneolaceae 59
## 15 Barnesiellaceae 884
## 16 Bdellovibrionaceae 38
## 17 Beijerinckiaceae 156
## 18 Bifidobacteriaceae 429
## 19 Brevibacillaceae 50
## 20 Burkholderiaceae 2424
## 21 CAG-138 59
## 22 CAG-698 17
## 23 CAG-826 2
## 24 Caldibacillaceae 64
## 25 Campylobacteraceae 72
## 26 Caulobacteraceae 439
## 27 Cellulomonadaceae 367
## 28 Chitinophagaceae 571
## 29 Cloacimonadaceae 48
## 30 Coprobacteraceae 8
## 31 Coprothermobacteraceae 96
## 32 Coriobacteriaceae 297
## 33 Corynebacteriaceae 379
## 34 Crocinitomicaceae 47
## 35 Cryomorphaceae 246
## 36 Cyanobacteriaceae 51
## 37 Cyclobacteriaceae 166
## 38 Dehalococcoidaceae 5
## 39 Dermatophilaceae 102
## 40 Desulfobacteraceae 258
## 41 Desulfomicrobiaceae 22
## 42 Desulfovibrionaceae 399
## 43 Dethiosulfovibrionaceae 40
## 44 Devosiaceae 169
## 45 Dictyoglomaceae 69
## 46 DTU089 125
## 47 Dysgonomonadaceae 59
## 48 Eggerthellaceae 231
## 49 Enterobacteriaceae 1402
## 50 Enterococcaceae 719
## 51 Erysipelatoclostridiaceae 1125
## 52 Erysipelotrichaceae 762
## 53 EtOH8 74
## 54 Exiguobacteraceae 210
## 55 Ferrovibrionaceae 37
## 56 Fervidobacteriaceae 126
## 57 Flavobacteriaceae 442
## 58 Fusobacteriaceae 164
## 59 Hahellaceae 79
## 60 Haloferacaceae 70
## 61 Helicobacteraceae 30
## 62 Inquilinaceae 74
## 63 JdFR-45 38
## 64 JGI-0000106-J15 32
## 65 koll-22 56
## 66 Koribacteraceae 253
## 67 Lachnospiraceae 2316
## 68 Lactobacillaceae 619
## 69 Magnetospirillaceae 51
## 70 Marinifilaceae 283
## 71 Marinomonadaceae 60
## 72 Megasphaeraceae 969
## 73 Methanobacteriaceae 280
## 74 Methanomassiliicoccaceae 3
## 75 Methanomethylophilaceae 6
## 76 Methanomicrobiaceae 29
## 77 Methanoregulaceae 64
## 78 Methanosarcinaceae 204
## 79 Methanothermobacteraceae 4
## 80 Methanotrichaceae 107
## 81 Methylophilaceae 21
## 82 Microbacteriaceae 139
## 83 Micrococcaceae 73
## 84 Moraxellaceae 1600
## 85 Mycoplasmataceae 30
## 86 Mycoplasmataceae_B 5
## 87 Neisseriaceae 95
## 88 Nitrosopumilaceae 84
## 89 Oscillospiraceae 81
## 90 Paenibacillaceae 33
## 91 Paludibacteraceae 937
## 92 Pasteurellaceae 134
## 93 Peptoniphilaceae 50
## 94 Peptostreptococcaceae 73
## 95 Phormidiaceae 60
## 96 Planococcaceae 17
## 97 Porphyromonadaceae 43
## 98 Porticoccaceae 5
## 99 Propionibacteriaceae 378
## 100 Pseudomonadaceae 121
## 101 Rhizobiaceae 152
## 102 Rhodobacteraceae 154
## 103 Rhodocyclaceae 13
## 104 Rhodothermaceae 57
## 105 Rikenellaceae 231
## 106 Ruminococcaceae 502
## 107 Selenomonadaceae 156
## 108 Sphaerochaetaceae 7
## 109 Sphingobacteriaceae 351
## 110 Sphingomonadaceae 69
## 111 Spirochaetaceae_B 33
## 112 Spirosomaceae 65
## 113 Staphylococcaceae 169
## 114 Streptococcaceae 117
## 115 Sulfolobaceae 211
## 116 Synergistaceae 130
## 117 Syntrophobotulaceae 79
## 118 Tannerellaceae 25
## 119 Thalassospiraceae 41
## 120 Thermaceae 2498
## 121 Thermodesulfovibrionaceae 366
## 122 Thermomicrobiaceae 21
## 123 Thermoplasmataceae 56
## 124 Thermotogaceae 54
## 125 Thermovirgaceae 227
## 126 Treponemataceae 608
## 127 Turicibacteraceae 35
## 128 UA16 119
## 129 UBA213 39
## 130 UBA3002 87
## 131 UBA5184 51
## 132 UBA7430 58
## 133 UBA955 55
## 134 UKL13-3 48
## 135 Veillonellaceae 670
## 136 Verrucomicrobiaceae 243
## 137 Weeksellaceae 2202
## 138 Xanthomonadaceae 485
print(paste0(sum(full.div.df$frequency)," total PULs"))
## [1] "36172 total PULs"
## CAZYMES COUNTS
## Filter the DBcan4 output
## e-value <= 1e-18
## coverage >= 0.8
setwd(dir = '/home/chombo/Documents/SEGOVIA/CAZymes_PULs/data/')
## Create a df with all the CAZy counts according to each MAG
CAZYS.df <- data.frame()
for (mag in MAGS.arr) {
## Filter the 5 MAGs that do not have significant CAZys associated with them
if (mag == "3300007500_43") {
next
}
## Read the HMMER output for those matches with an evalue <= 1e-18 and a coverage >= 0.8
df.temp <- read.table(file = paste(mag,"/output_dbcan4/hmmer.out",sep = ""),sep = "\t",skip = 1)
colnames(df.temp) <- c("CAZy","length","geneID","genelength","evalue","start","end","genestart","geneend","coverage")
df.temp <- df.temp %>%
filter(evalue <= 1e-18,coverage >= 0.8) %>%
select(CAZy,geneID,evalue,coverage)
## Extract the gene ID
ids <- as.vector(df.temp$geneID)
## Read the overview.txt file to compare it with the hmmer.txt file
df.temp <- read.table(paste(mag,"/output_dbcan4/overview.txt",sep = ""),header = FALSE,skip = 1)
colnames(df.temp) <- c("geneID","EC","HMMER","dbCAN_sub","DIAMOND","Signalp","Tools")
df.temp["MAG"] <- mag
df.temp <- df.temp[is.element(df.temp$geneID,ids),]
df.temp <- df.temp %>%
filter(Tools >= 2) %>%
select(MAG,HMMER,Signalp)
df.temp$HMMER <- gsub("\\(.*?\\)", "", df.temp$HMMER)
df.temp$Signalp <- gsub("\\(.*\\)", "",df.temp$Signalp)
df.temp <- df.temp[df.temp$HMMER != "-",]
## Save the CAZYs frequency
freq <- as.data.frame(table(df.temp$HMMER))
freq <- freq[order(freq$Freq,decreasing = TRUE),]
# write.table(freq,file = paste(mag,"/output_dbcan4/",mag,".CAZyCounts.txt",sep = ""),sep = "\t",quote = FALSE,col.names = FALSE,row.names = FALSE)
## Clean the CAZys and do another df
df.temp$HMMER <- gsub("\\+.*", "", df.temp$HMMER)
## Save the CAZYs frequency
freq <- as.data.frame(table(df.temp$HMMER))
# freq <- freq[order(freq$Freq,decreasing = TRUE),]
# write.table(freq,file = paste(mag,"/output_dbcan4/",mag,".CAZyCounts.single.txt",sep = ""),sep = "\t",quote = FALSE,col.names = FALSE,row.names = FALSE)
if (length(df.temp) == 0) {
print(mag)
}
## Store them in a single df
CAZYS.df <- rbind(CAZYS.df,df.temp)
}
colnames(CAZYS.df) <- c("MAG","CAZys","Signalp")
print(paste0(length(CAZYS.df$CAZys)," total CAZys"))
## [1] "61654 total CAZys"
print(paste0(length(unique(CAZYS.df$CAZys))," different CAZys"))
## [1] "424 different CAZys"
print(paste0(length(unique(CAZYS.df$MAG))," final MAGs"))
## [1] "1064 final MAGs"
## 61654 total CAZys
## 424 different CAZys
## 1064 final MAGs
## Get the summary
CAZysperMAG.df <- as.data.frame(table(CAZYS.df$MAG))
colnames(CAZysperMAG.df) <- c("MAGs","CAZys_Freq")
excludedMAGS <- data.frame(MAGs = c("3300007500_43","3300011404_8","3300012067_9","3300012151_17","3300012165_16"),
CAZys_Freq = c(0,0,0,0,0))
CAZysperMAG.df <- rbind(CAZysperMAG.df,excludedMAGS)
## Search for MAGs without CAZys
# setdiff(MAGS.arr,c(CAZysperMAG.df$MAGs))
# [1] "3300011404_8" "3300012067_9" "3300012151_17"
# [4] "3300012165_16"
## Read the metadata and filter it
setwd(dir = '/home/chombo/Documents/SEGOVIA/CAZymes_PULs/data/')
## Read the MAGs metadata
MAGS.meta <- read.table(file = "1069_MAGs_metadata.tsv",sep = "\t",header = TRUE)
MAGS.meta <- MAGS.meta %>%
select(genome_id,class,order,family,ecosystem_category)
MAGS.meta$ecosystem_category <- gsub(" ","_",MAGS.meta$ecosystem_category)
## Remove the excluded MAGs
MAGS.meta <- MAGS.meta[!is.element(MAGS.meta$genome_id,c("3300007500_43","3300011404_8","3300012067_9","3300012151_17","3300012165_16")),]
## CAZys per Ecosystem
eco.arr <- MAGS.meta %>%
distinct(ecosystem_category)
eco.arr <- eco.arr[order(eco.arr$ecosystem_category,decreasing = FALSE),]
full.div.df <- data.frame()
for (eco in eco.arr) {
## Extract all the MAGs annotated in each Class
mags.arr.temp <- as.vector(MAGS.meta$genome_id[MAGS.meta$ecosystem_category == eco])
## For each MAG, extract all the PUL counts
CAZYsperMAG.temp <- CAZysperMAG.df[is.element(CAZysperMAG.df$MAGs,mags.arr.temp),]
## Extract the complete diversity
eco.div.df <- data.frame(order = c(eco),
frequency = sum(CAZYsperMAG.temp$CAZys_Freq))
full.div.df <- rbind(full.div.df,eco.div.df)
## Extract the PULs diversity
rawCAZys.temp <- CAZYS.df[is.element(CAZYS.df$MAG,mags.arr.temp),]
cazyscounts.temp <- as.data.frame(table(rawCAZys.temp$CAZys))
colnames(cazyscounts.temp) <- c("CAZys","frequency")
cazyscounts.temp <- cazyscounts.temp[order(cazyscounts.temp$frequency,decreasing = TRUE),]
}
head(full.div.df,139L)
## order frequency
## 1 Algae 50
## 2 Animal 913
## 3 Annelida 23
## 4 Aquatic 5705
## 5 Arthropoda 1948
## 6 Bioreactor 773
## 7 Bioremediation 132
## 8 Biotransformation 43
## 9 Birds 676
## 10 Built_environment 9614
## 11 Cnidaria 67
## 12 Fungi 144
## 13 Human 28160
## 14 Insecta 1100
## 15 Lab_enrichment 2029
## 16 Lab_synthesis 57
## 17 Mammals 1757
## 18 Microbial 130
## 19 Modeled 58
## 20 Plants 760
## 21 Solid_waste 227
## 22 Terrestrial 5814
## 23 Wastewater 1474
print(paste0(sum(full.div.df$frequency)," total CAZymes"))
## [1] "61654 total CAZymes"
## Read the metadata and filter it
setwd(dir = '/home/chombo/Documents/SEGOVIA/CAZymes_PULs/data/')
## CAZys per Class
class.arr <- MAGS.meta %>%
distinct(class)
class.arr <- class.arr[order(class.arr$class,decreasing = FALSE),]
full.div.df <- data.frame()
for (class in class.arr) {
## Extract all the MAGs annotated in each Class
mags.arr.temp <- as.vector(MAGS.meta$genome_id[MAGS.meta$class == class])
## For each MAG, extract all the PUL counts
CAZYsperMAG.temp <- CAZysperMAG.df[is.element(CAZysperMAG.df$MAGs,mags.arr.temp),]
## Extract the complete diversity
class.div.df <- data.frame(class = c(class),
frequency = sum(CAZYsperMAG.temp$CAZys_Freq))
full.div.df <- rbind(full.div.df,class.div.df)
## Extract the PULs diversity
rawCAZys.temp <- CAZYS.df[is.element(CAZYS.df$MAG,mags.arr.temp),]
cazyscounts.temp <- as.data.frame(table(rawCAZys.temp$CAZys))
colnames(cazyscounts.temp) <- c("CAZys","frequency")
cazyscounts.temp <- cazyscounts.temp[order(cazyscounts.temp$frequency,decreasing = TRUE),]
}
head(full.div.df,139L)
## class frequency
## 1 AC1 186
## 2 Acidobacteriae 198
## 3 Actinobacteria 6152
## 4 Alphaproteobacteria 2219
## 5 Archaeoglobi 26
## 6 Bacilli 6473
## 7 Bacteroidia 23498
## 8 Bdellovibrionia 43
## 9 Campylobacteria 107
## 10 Chloroflexia 76
## 11 Cloacimonadia 24
## 12 Clostridia 5845
## 13 Coprothermobacteria 52
## 14 Coriobacteriia 583
## 15 Cyanobacteriia 127
## 16 Dehalococcoidia 18
## 17 Deinococci 1713
## 18 Desulfitobacteriia 21
## 19 Desulfobacteria 180
## 20 Desulfovibrionia 378
## 21 Dictyoglomia 157
## 22 Fusobacteriia 97
## 23 Gammaproteobacteria 8501
## 24 Halobacteria 26
## 25 Methanobacteria 179
## 26 Methanomicrobia 28
## 27 Methanosarcinia 131
## 28 Negativicutes 2442
## 29 Nitrososphaeria 30
## 30 Rhodothermia 620
## 31 Spirochaetia 467
## 32 Synergistia 270
## 33 Thermodesulfovibrionia 287
## 34 Thermoplasmata 69
## 35 Thermoprotei 93
## 36 Thermotogae 97
## 37 UBA5184 52
## 38 Verrucomicrobiae 189
print(paste0(sum(full.div.df$frequency)," total CAZymes"))
## [1] "61654 total CAZymes"
## Read the metadata and filter it
setwd(dir = '/home/chombo/Documents/SEGOVIA/CAZymes_PULs/data/')
## CAZys per Order
order.arr <- MAGS.meta %>%
distinct(order)
order.arr <- order.arr[order(order.arr$order,decreasing = FALSE),]
full.div.df <- data.frame()
for (ord in order.arr) {
## Corresponds to the MAG without counts
if (ord == "Mycoplasmatales") {
next
}
## Extract all the MAGs annotated in each Class
mags.arr.temp <- as.vector(MAGS.meta$genome_id[MAGS.meta$order == ord])
## For each MAG, extract all the CAZy counts
CAZYsperMAG.temp <- CAZysperMAG.df[is.element(CAZysperMAG.df$MAGs,mags.arr.temp),]
## Extract the complete diversity
order.div.df <- data.frame(order = c(ord),
frequency = sum(CAZYsperMAG.temp$CAZys_Freq))
full.div.df <- rbind(full.div.df,order.div.df)
## Extract the CAZys diversity
rawCAZys.temp <- CAZYS.df[is.element(CAZYS.df$MAG,mags.arr.temp),]
cazyscounts.temp <- as.data.frame(table(rawCAZys.temp$CAZys))
colnames(cazyscounts.temp) <- c("CAZys","frequency")
cazyscounts.temp <- cazyscounts.temp[order(cazyscounts.temp$frequency,decreasing = TRUE),]
}
head(full.div.df,139L)
## order frequency
## 1 AC1 186
## 2 Acetivibrionales 102
## 3 Acetobacterales 97
## 4 Acidaminococcales 705
## 5 Acidiprofundales 7
## 6 Acidobacteriales 198
## 7 Actinomycetales 4931
## 8 Archaeoglobales 26
## 9 Azospirillales 58
## 10 Bacillales 98
## 11 Bacteroidales 12264
## 12 Balneolales 109
## 13 Bdellovibrionales 43
## 14 Betaproteobacteriales 2189
## 15 Brevibacillales 56
## 16 Caldiarchaeales 5
## 17 Campylobacterales 107
## 18 Caulobacterales 847
## 19 Chitinophagales 2875
## 20 Christensenellales 43
## 21 Cloacimonadales 24
## 22 Coprothermobacterales 52
## 23 Coriobacteriales 583
## 24 Corynebacteriales 557
## 25 Cyanobacteriales 127
## 26 Cytophagales 816
## 27 Dehalococcoidales 18
## 28 Deinococcales 1713
## 29 Desulfitobacteriales 21
## 30 Desulfobacterales 180
## 31 Desulfovibrionales 378
## 32 Dictyoglomales 157
## 33 Enterobacterales 2910
## 34 Erysipelotrichales 2358
## 35 Exiguobacterales 542
## 36 Flavobacteriales 6209
## 37 Fusobacteriales 97
## 38 Halobacteriales 26
## 39 Haloplasmatales 88
## 40 Lachnospirales 4190
## 41 Lactobacillales 2848
## 42 Methanobacteriales 179
## 43 Methanomassiliicoccales 45
## 44 Methanomicrobiales 28
## 45 Methanosarcinales 96
## 46 Methanotrichales 35
## 47 ML615J-28 23
## 48 Nitrososphaerales 25
## 49 NS11-12g 99
## 50 Oscillospirales 1397
## 51 Paenibacillales 214
## 52 Peptostreptococcales 103
## 53 Propionibacteriales 664
## 54 Pseudomonadales 2151
## 55 RFN20 46
## 56 Rhizobiales 717
## 57 Rhodobacterales 116
## 58 Rhodospirillales 167
## 59 Rhodothermales 511
## 60 Selenomonadales 284
## 61 Sneathiellales 33
## 62 Sphaerochaetales 60
## 63 Sphingobacteriales 1235
## 64 Sphingomonadales 157
## 65 Spirochaetales 23
## 66 Staphylococcales 200
## 67 Sulfolobales 93
## 68 Synergistales 270
## 69 Thermodesulfovibrionales 287
## 70 Thermomicrobiales 76
## 71 Thermoplasmatales 17
## 72 Thermotogales 97
## 73 Tissierellales 10
## 74 Treponematales 384
## 75 UBA5184 52
## 76 UBA998 27
## 77 Veillonellales 1453
## 78 Verrucomicrobiales 189
## 79 Xanthomonadales 1251
print(paste0(sum(full.div.df$frequency)," total CAZymes"))
## [1] "61654 total CAZymes"
## Read the metadata and filter it
setwd(dir = '/home/chombo/Documents/SEGOVIA/CAZymes_PULs/data/')
## CAZys per Family
fam.arr <- MAGS.meta %>%
distinct(family)
fam.arr <- fam.arr[order(fam.arr$family,decreasing = FALSE),]
full.div.df <- data.frame()
for (fam in fam.arr) {
## Extract all the MAGs annotated in each Class
mags.arr.temp <- as.vector(MAGS.meta$genome_id[MAGS.meta$family == fam])
## For each MAG, extract all the PUL counts
CAZYsperMAG.temp <- CAZysperMAG.df[is.element(CAZysperMAG.df$MAGs,mags.arr.temp),]
## Extract the complete diversity
family.div.df <- data.frame(order = c(fam),
frequency = sum(CAZYsperMAG.temp$CAZys_Freq))
full.div.df <- rbind(full.div.df,family.div.df)
## Extract the PULs diversity
rawCAZys.temp <- CAZYS.df[is.element(CAZYS.df$MAG,mags.arr.temp),]
cazyscounts.temp <- as.data.frame(table(rawCAZys.temp$CAZys))
colnames(cazyscounts.temp) <- c("CAZys","frequency")
cazyscounts.temp <- cazyscounts.temp[order(cazyscounts.temp$frequency,decreasing = TRUE),]
}
head(full.div.df,139L)
## order frequency
## 1 1G12 67
## 2 Acetivibrionaceae 102
## 3 Acetobacteraceae 97
## 4 Acidaminococcaceae 705
## 5 Acidobacteriaceae 112
## 6 Aerococcaceae 33
## 7 Akkermansiaceae 83
## 8 Alcanivoracaceae 178
## 9 Aminobacteriaceae 40
## 10 Anaerovoracaceae 56
## 11 Archaeoglobaceae 26
## 12 Atopobiaceae 141
## 13 Bacteroidaceae 2208
## 14 Balneolaceae 109
## 15 Barnesiellaceae 6664
## 16 Bdellovibrionaceae 43
## 17 Beijerinckiaceae 246
## 18 Bifidobacteriaceae 1897
## 19 Brevibacillaceae 56
## 20 Burkholderiaceae 1887
## 21 CAG-138 43
## 22 CAG-698 23
## 23 CAG-826 46
## 24 Caldibacillaceae 48
## 25 Campylobacteraceae 85
## 26 Caulobacteraceae 847
## 27 Cellulomonadaceae 2565
## 28 Chitinophagaceae 2875
## 29 Cloacimonadaceae 24
## 30 Coprobacteraceae 156
## 31 Coprothermobacteraceae 52
## 32 Coriobacteriaceae 275
## 33 Corynebacteriaceae 557
## 34 Crocinitomicaceae 94
## 35 Cryomorphaceae 269
## 36 Cyanobacteriaceae 51
## 37 Cyclobacteriaceae 622
## 38 Dehalococcoidaceae 18
## 39 Dermatophilaceae 163
## 40 Desulfobacteraceae 180
## 41 Desulfomicrobiaceae 37
## 42 Desulfovibrionaceae 341
## 43 Dethiosulfovibrionaceae 21
## 44 Devosiaceae 294
## 45 Dictyoglomaceae 157
## 46 DTU089 288
## 47 Dysgonomonadaceae 506
## 48 Eggerthellaceae 167
## 49 Enterobacteriaceae 2726
## 50 Enterococcaceae 1550
## 51 Erysipelatoclostridiaceae 1546
## 52 Erysipelotrichaceae 812
## 53 EtOH8 186
## 54 Exiguobacteraceae 542
## 55 Ferrovibrionaceae 33
## 56 Fervidobacteriaceae 60
## 57 Flavobacteriaceae 2142
## 58 Fusobacteriaceae 97
## 59 Hahellaceae 157
## 60 Haloferacaceae 26
## 61 Helicobacteraceae 22
## 62 Inquilinaceae 58
## 63 JdFR-45 7
## 64 JGI-0000106-J15 5
## 65 koll-22 149
## 66 Koribacteraceae 86
## 67 Lachnospiraceae 4190
## 68 Lactobacillaceae 966
## 69 Magnetospirillaceae 45
## 70 Marinifilaceae 193
## 71 Marinomonadaceae 100
## 72 Megasphaeraceae 817
## 73 Methanobacteriaceae 161
## 74 Methanomassiliicoccaceae 35
## 75 Methanomethylophilaceae 10
## 76 Methanomicrobiaceae 20
## 77 Methanoregulaceae 8
## 78 Methanosarcinaceae 96
## 79 Methanothermobacteraceae 18
## 80 Methanotrichaceae 35
## 81 Methylophilaceae 126
## 82 Microbacteriaceae 207
## 83 Micrococcaceae 99
## 84 Moraxellaceae 1500
## 85 Neisseriaceae 144
## 86 Nitrosopumilaceae 17
## 87 Oscillospiraceae 75
## 88 Paenibacillaceae 214
## 89 Paludibacteraceae 1657
## 90 Pasteurellaceae 184
## 91 Peptoniphilaceae 10
## 92 Peptostreptococcaceae 47
## 93 Phormidiaceae 76
## 94 Planococcaceae 50
## 95 Porphyromonadaceae 67
## 96 Porticoccaceae 20
## 97 Propionibacteriaceae 664
## 98 Pseudomonadaceae 196
## 99 Rhizobiaceae 177
## 100 Rhodobacteraceae 116
## 101 Rhodocyclaceae 32
## 102 Rhodothermaceae 511
## 103 Rikenellaceae 506
## 104 Ruminococcaceae 1034
## 105 Selenomonadaceae 284
## 106 Sphaerochaetaceae 60
## 107 Sphingobacteriaceae 1235
## 108 Sphingomonadaceae 157
## 109 Spirochaetaceae_B 23
## 110 Spirosomaceae 194
## 111 Staphylococcaceae 200
## 112 Streptococcaceae 299
## 113 Sulfolobaceae 93
## 114 Synergistaceae 100
## 115 Syntrophobotulaceae 21
## 116 Tannerellaceae 307
## 117 Thalassospiraceae 122
## 118 Thermaceae 1713
## 119 Thermodesulfovibrionaceae 287
## 120 Thermomicrobiaceae 76
## 121 Thermoplasmataceae 17
## 122 Thermotogaceae 37
## 123 Thermovirgaceae 109
## 124 Treponemataceae 384
## 125 Turicibacteraceae 88
## 126 UA16 53
## 127 UBA213 8
## 128 UBA3002 27
## 129 UBA5184 52
## 130 UBA7430 27
## 131 UBA955 33
## 132 UKL13-3 66
## 133 Veillonellaceae 636
## 134 Verrucomicrobiaceae 106
## 135 Weeksellaceae 3408
## 136 Xanthomonadaceae 1251
print(paste0(sum(full.div.df$frequency)," total CAZymes"))
## [1] "61654 total CAZymes"
The approach was create a data frame with the PULs diversity per MAG
(IMPORTANT: diversity is DIFFERENT from frequency) because we wanted to
see which MAGs had the most variety of PULs. This was also made with the
CAZymes and first. Doing the filtering, we obtained a reduced matrix and
obtained their distances with the vegdist(mtx,"bray")
function from the vegan package. Finally, we plot the
PCoA by their Ecosystem, Class, Order and Family for PULs and CAZymes.
PULs code
## DATA PREPARATION
## Extracting all the metadata for the subsecuent analysis
## There are no other metadata dataframes than the extracted here
## Set working directory
setwd("/home/chombo/Documents/SEGOVIA/CAZymes_PULs/data/")
## Read the MAGs metadata and extract the columns: class, order, family, ecosystem
MAGS.meta <- read.table(file = "1069_MAGs_metadata.tsv",sep = "\t",header = TRUE,row.names = 1)
MAGS.meta <- MAGS.meta %>%
select(class,order,family,ecosystem_category)
colnames(MAGS.meta) <- c("Class","Order","Family","Ecosystem")
MAGS.meta$Ecosystem <- str_replace(MAGS.meta$Ecosystem," ","_")
## Read the PULs per MAG metadata
PULSperMAG <- read.table(file = "PULsperMAG.tsv",sep = "\t",header = FALSE,row.names = 1)
colnames(PULSperMAG) <- c("PULs")
## Merge both data frames
MAGS.meta <- merge(MAGS.meta, PULSperMAG,
by = 'row.names', all = TRUE)
## Import the data for a matrix
MAGS.arr <- as.vector(read.table("1069MAGS_list.txt"))[[1]]
## Extract the PULs that are present >= 25 MAGs
all.puls <- data.frame()
for (mag in MAGS.arr) {
## Filter the 5 MAGs that do not have significant PULs associated with them
if (mag == "3300005326_35" || mag == "3300005370_1" || mag == "3300017650_9" || mag == "3300027284_65" || mag == "3300027607_50") {
next
}
df.temp <- read.table(file = paste(mag,"/",mag,".matches_counts.out",sep = ""),sep = "\t",)
colnames(df.temp) <- c("pul","frequency")
df.temp["mag"] <- mag
df.temp <- df.temp %>%
select(mag,pul,everything())
## Create a dataframe with the nodes: mag = from, pul = to, freq = value
all.puls <- rbind(all.puls,df.temp)
}
## Extract the PULs counts per MAG
mags.data <- read.table(file = "PULsperMAG.tsv",sep = "\t",header = FALSE)
colnames(mags.data) <- c("mag","Freq")
# summary(mags.data$Freq)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.00 27.00 48.00 57.67 73.00 273.00
mags.data.cut <- mags.data[mags.data$Freq >= 48,]
# length(mags.data.cut$mag)
# [1] 240
filtered.mags.vec <- as.vector(mags.data.cut$mag)
## Extract PULs diversity
puls.data <- read.table(file = "PULS.diversity.tsv",sep = "\t",header = FALSE)
colnames(puls.data) <- c("puls","Freq")
# summary(puls.data$Freq)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 1.00 2.00 6.00 26.54 19.50 988.00
puls.data.cut <- puls.data[puls.data$Freq >= 27,]
# length(puls.data.cut$puls)
# [1] 285
filtered.puls.vec <- as.vector(puls.data.cut$puls)
# ===================================================================
## At the end we will be working with a matrix 445 MAGs x 303 PULs
# ===================================================================
## UNIVERSAL MATRIX GENERATION
## Use the current data
## Filter the original data by the filtered vectors
matrix.data <- all.puls[is.element(all.puls$mag,filtered.mags.vec),]
matrix.data <- matrix.data[is.element(matrix.data$pul,filtered.puls.vec),]
## Generate the matrix
mtx <- matrix.data %>%
spread(pul, frequency) %>%
column_to_rownames('mag') %>%
as.matrix() %>%
replace(is.na(.), 0)
CAZymes code
## DATA PREPARATION
## Extracting all the metadata for the subsecuent analysis
## There are no other metadata dataframes than the extracted here
## Set working directory
setwd("/home/chombo/Documents/SEGOVIA/CAZymes_PULs/data/")
## Read the MAGs metadata and extract the columns: class, order, family, ecosystem
MAGS.meta <- read.table(file = "1069_MAGs_metadata.tsv",sep = "\t",header = TRUE,row.names = 1)
MAGS.meta <- MAGS.meta %>%
select(class,order,family,ecosystem_category)
colnames(MAGS.meta) <- c("Class","Order","Family","Ecosystem")
MAGS.meta$Ecosystem <- str_replace(MAGS.meta$Ecosystem," ","_")
## Read the PULs per MAG metadata
CAZYSperMAG <- read.table(file = "CAZysperMAG.tsv",sep = "\t",header = FALSE,row.names = 1)
colnames(CAZYSperMAG) <- c("CAZys")
## Merge both data frames
MAGS.meta <- merge(MAGS.meta, CAZYSperMAG,
by = 'row.names', all = TRUE)
## Import the data for a matrix
MAGS.arr <- as.vector(read.table("1069MAGS_list.txt"))[[1]]
## Extract the CAZys
all.cazys <- data.frame()
for (mag in MAGS.arr) {
## Filter the 5 MAGs that do not have significant CAZys associated with them
if (mag == "3300007500_43" || mag == "3300011404_8" || mag == "3300012067_9" || mag == "3300012151_17" || mag == "3300012165_16" || mag == "3300007500_43") {
next
}
df.temp <- read.table(file = paste(mag,"/output_dbcan4/",mag,".CAZyCounts.single.txt",sep = ""),sep = "\t",)
colnames(df.temp) <- c("cazy","frequency")
df.temp["mag"] <- mag
df.temp <- df.temp %>%
select(mag,cazy,everything())
## Create a dataframe with the nodes: mag = from, cazy = to, freq = value
all.cazys <- rbind(all.cazys,df.temp)
}
## Extract the CAZys counts per MAG
mags.data <- read.table(file = "CAZysperMAG.tsv",sep = "\t",header = FALSE)
colnames(mags.data) <- c("mag","Freq")
# summary(mags.data$Freq)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.00 27.00 48.00 57.67 73.00 273.00
mags.data.cut <- mags.data[mags.data$Freq >= 48,]
# length(mags.data.cut$Var1)
# [1] 539
filtered.mags.vec <- as.vector(mags.data.cut$mag)
## Extract CAZys diversity
cazys.data <- read.table(file = "CAZYS.diversity.tsv",sep = "\t",header = FALSE)
colnames(cazys.data) <- c("cazy","Freq")
# summary(cazys.data$Freq)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 1.0 6.0 27.0 145.4 108.2 5928.0
cazys.data.cut <- cazys.data[cazys.data$Freq >= 27,]
# length(cazys.data.cut$Var1)
# [1] 214
filtered.cazys.vec <- as.vector(cazys.data.cut$cazy)
# ===================================================================
## At the end we will be working with a matrix 539 MAGs x 214 CAZys
# ===================================================================
## UNIVERSAL MATRIX GENERATION
## Use the current data
## Filter the original data by the filtered vectors
matrix.data <- all.cazys[is.element(all.cazys$mag,filtered.mags.vec),]
matrix.data <- matrix.data[is.element(matrix.data$cazy,filtered.cazys.vec),]
## Generate the matrix
mtx <- matrix.data %>%
spread(cazy, frequency) %>%
column_to_rownames('mag') %>%
as.matrix() %>%
replace(is.na(.), 0)
After analyzing the obtained clusters, we decided to trim the data, staying only with the most significant clusters. This was only made with the PULs (since the PULs is our object of study) and the suggested approach is that the PULs cutoff PCoA distribution will be similar in the CAZymes with the same MAGs. Here is the code where we trim by ‘Class’, ‘Order’ and ‘Family’. It is important to remark that the ‘Ecosystem’ distribution was poorly well distributed. At the end, there were only 343 MAGs left. And those are the ones we will be working with from now on.
Ecosystem before filtering
Ecosystem after filtering
Class before filtering
Class after filtering
Order before filtering
Order after filtering
Family before filtering
Family after filtering
Ecosystem before filtering
Ecosystem after filtering
Class before filtering
Class after filtering
Order before filtering
Order after filtering
Family before filtering
Family after filtering
After the final filtering. The 343 MAGs were stored in a matrix
against the PULs and the CAZymes to see how were they distributed. For
that, hclust()
function was used as well as the
vegdist(mtx,"bray")
function for obtaining the distances.
Here are the obtained distributions.
## Libraries
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(pheatmap))
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(textshape))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(vegan))
suppressPackageStartupMessages(library(dendextend))
suppressPackageStartupMessages(library(circlize))
## FUNCTIONS
## EXTRACT CAZYMES
get.cazymes <- function(mags) {
all.cazys <- data.frame()
for (mag in mags) {
## Filter the 5 MAGs that do not have significant PULs associated with them
if (mag == "3300007500_43" || mag == "3300011404_8" || mag == "3300012067_9" || mag == "3300012151_17" || mag == "3300012165_16" || mag == "3300007500_43") {
next
}
df.temp <- read.table(file = paste(mag,"/output_dbcan4/",mag,".CAZyCounts.single.txt",sep = ""),sep = "\t",)
colnames(df.temp) <- c("cazy","frequency")
df.temp["mag"] <- mag
df.temp <- df.temp %>%
select(mag,cazy,everything())
## Create a dataframe with the nodes: mag = from, cazy = to, freq = value
all.cazys <- rbind(all.cazys,df.temp)
}
return(all.cazys)
}
## EXTRACT PULS
get.puls <- function(mags) {
all.puls <- data.frame()
for (mag in mags) {
# ## Filter the 5 MAGs that do not have significant PULs associated with them
if (mag == "3300005326_35" || mag == "3300005370_1" || mag == "3300017650_9" || mag == "3300027284_65" || mag == "3300027607_50") {
next
}
df.temp <- read.table(file = paste(mag,"/",mag,".matches_counts.out",sep = ""),sep = "\t",)
colnames(df.temp) <- c("pul","frequency")
df.temp["mag"] <- mag
df.temp <- df.temp %>%
select(mag,pul,everything())
## Create a dataframe with the nodes: mag = from, pul = to, freq = value
all.puls <- rbind(all.puls,df.temp)
}
return(all.puls)
}
## DATA PREPARATION
## Extracting all the metadata for the subsecuent analysis
## There are no other metadata dataframes than the extracted here
## Set working directory
setwd("/home/chombo/Documents/SEGOVIA/CAZymes_PULs/data/")
## Read the MAGs metadata and extract the columns: class, order, family, ecosystem
MAGS.meta <- read.table(file = "1069_MAGs_metadata.tsv",sep = "\t",header = TRUE,row.names = 1)
MAGS.meta <- MAGS.meta %>%
select(PULsFrequency,CAZysFrequency,class,order,family,specie,ecosystem_category)
colnames(MAGS.meta) <- c("PULS","CAZYS","Class","Order","Family","Specie","Ecosystem")
MAGS.meta$Ecosystem <- str_replace(MAGS.meta$Ecosystem," ","_")
MAGS.meta["mag"] <- rownames(MAGS.meta)
## Read the 343 MAGs new list
MAGS.arr <- as.vector(read.table("343_MAGslist.txt"))[[1]]
## Filter metadata
MAGS.meta <- MAGS.meta[is.element(MAGS.meta$mag,MAGS.arr),]
## Extract the counts
all.cazys <- get.cazymes(mags = MAGS.arr)
## Create the matrix
mtx <- all.cazys %>%
spread(cazy, frequency) %>%
column_to_rownames('mag') %>%
as.matrix() %>%
replace(is.na(.), 0)
#mtx <- vegdist(mtx,"bray")
dend <- as.dendrogram(hclust(vegdist(mtx,"bray")))
## Colors
dend <- dend %>%
color_branches(k = 19) %>%
raise.dendrogram (15) %>%
hang.dendrogram()
## Control quality plot
ggplot(dend, labels = FALSE) + scale_y_reverse(expand = c(0.2, 0)) + coord_polar(theta="x")
png(filename = "CAZy_dend_test.png",width = 4000,height = 2000,res = 100)
factoextra::fviz_dend(dend, cex = 0.5, k = 20,
color_labels_by_k = TRUE, rect = TRUE)
dev.off()
## Write the tree
phy <- ape::as.phylo(hclust(vegdist(mtx,"bray")))
ape::write.tree(phy = phy,file='CAZy343_ward_Class.tree')
## Repeat the whole process with PULs
all.puls <- get.puls(mags = MAGS.arr)
## Create the matrix
mtx <- all.puls %>%
spread(pul, frequency) %>%
column_to_rownames('mag') %>%
as.matrix() %>%
replace(is.na(.), 0)
#mtx <- vegdist(mtx,"bray")
dend <- as.dendrogram(hclust(vegdist(mtx,"bray")))
## Colors
dend <- dend %>%
color_branches(k = 19) %>%
raise.dendrogram (15) %>%
hang.dendrogram()
## Control quality plot
ggplot(dend, labels = FALSE) + scale_y_reverse(expand = c(0.2, 0)) + coord_polar(theta="x")
png(filename = "PULs_dend_test.png",width = 4000,height = 2000,res = 100)
factoextra::fviz_dend(dend, cex = 0.5, k = 20,
color_labels_by_k = TRUE, rect = TRUE)
dev.off()
## Write the tree
phy <- ape::as.phylo(hclust(vegdist(mtx,"bray")))
ape::write.tree(phy = phy,file='PULs343_ward_Class.tree')
PULs
CAZymes
The distribution fo CAZyme families per Family.
Here are the substrates per Family identified for the PULs. The cutoff for the substrates was >= 100 counts from total. (A cutoff by the median was also evaluated. There is no meaningful difference between the cutoff by 100 and by the median).
## SUBSTRATES
## PULS
## Load the PULs substrate dataset
puls.substrate <- read_excel("PULSUB_05102023.xlsx")
puls.substrate <- puls.substrate %>%
select(PULID,substrate_final,cazymes_predicted_dbcan)
colnames(puls.substrate) <- c("pul","substrate","predictedCAZY")
## CLEAN MAGS BY THEIR SUBSTRATE
all.puls <- merge(all.puls,puls.substrate, by = "pul",all.x = TRUE) %>%
select(!c("predictedCAZY"))
all.puls$substrate <- replace_na(all.puls$substrate,"Unknown")
all.puls <- all.puls %>%
filter(substrate != "Unknown")
# ================================================================================
## Unknown PULs = 7812
## Left PULs = 7938
## MAGs = 343
## Substrates = 143 w/Unknown
# ================================================================================
substrates.unique <- all.puls %>%
filter(substrate != "Unknown") %>%
group_by(substrate) %>%
summarize(counts = sum(frequency))
## summary(substrates.unique$counts)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 1.00 7.00 24.50 66.92 74.75 651.00
substrates.unique <- substrates.unique %>%
filter(counts >= 24)
substrates.arr <- c(substrates.unique$substrate)
all.puls <- all.puls[is.element(all.puls$substrate,substrates.arr),]
## CAZYMES
## Working with CAZymes is a little bit different because they have their own substrate binding module
## i.e., they are associated with an specific substrate. That is why the approach will be to group in
## a list all the possible substrates and substrate binding modules by each of the 175 available CAZymes
## in the 343 MAGs
## Load the CAZymes substrate dataset
cazymes.substrate <- read.table("CAZYSUB_08252022.tsv", sep = "\t", header = TRUE)
cazymes.substrate <- cazymes.substrate %>%
select(Family,Substrate_high_level,Name)
colnames(cazymes.substrate) <- c("cazy","substrate","enzyme")
## CLEAN MAGS BY THEIR SUBSTRATE
## Create an array with te unique CAZymes of the 343 MAGs
all.cazys.copy <- all.cazys
all.cazys.copy$cazy <- gsub("_\\d+","",all.cazys.copy$cazy)
## Remove the GTs
all.cazys.copy <- all.cazys.copy %>%
filter(!grepl("GT",cazy))
unique.cazys <- c(sort(unique(all.cazys.copy$cazy)))
## Create a list with the metadata of all the unique CAZymes
cazys.sub.list <- list()
cazy.sub.unknown <- c()
for (i in unique.cazys) {
df.temp <- subset(cazymes.substrate,is.element(cazymes.substrate$cazy,i))
cazys.sub.list[[i]] <- df.temp
## Store the CAZymes with unknown substrates for further analysis
if (length(row.names(df.temp)) == 0) {
cazy.sub.unknown <- c(cazy.sub.unknown,i)
}
}
## Some CAZymes were not found in the substrate dataset
## They were searched manually
cazys.sub.list[["PL27"]] <- data.frame(cazy = c("PL27","PL27"),
substrate = c("L-rhamnose","D-gluconarate"),
enzyme = c("L-rhamnose-α-1","4-D-glucuronate lyase"))
cazys.sub.list[["PL35"]] <- data.frame(cazy = c("PL35"),
substrate = c("chondroitin"),
enzyme = c("Chondroitin lyase"))
cazys.sub.list[["PL40"]] <- data.frame(cazy = c("PL40"),
substrate = c("ulvan"),
enzyme = c("Ulvan lyase"))
## Merge the CAZymes with the substrates
## Extract the substrates from each cazy data frame and merge them so they can be incorporated to the all.cazys data frame
trans.cazys.sub <- data.frame()
for (cazy in unique.cazys) {
sub.vec <- sort(unique(cazys.sub.list[[cazy]]$substrate))
sub.str <- paste(sub.vec,collapse = ",")
trans.cazys.sub <- rbind(trans.cazys.sub,data.frame(cazy = cazy,substrate = sub.str))
}
## Merge the substrates with the all.cazys data frame
all.cazys.copy <- merge(all.cazys.copy,trans.cazys.sub,
by = "cazy",all.x = TRUE)
# ================================================================================
## GT CAZymes = 6647
## Left CAZymes = 11547
## MAGs = 343
## Substrates = 87
# ================================================================================
## Filter the substrates by their median
substrates.unique <- all.cazys.copy %>%
group_by(substrate) %>%
summarize(counts = sum(frequency))
# summary(substrates.unique$counts)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 1.0 9.5 54.0 132.7 137.0 1036.0
substrates.unique <- substrates.unique %>%
filter(counts >= 54)
substrates.arr <- c(substrates.unique$substrate)
all.cazys.copy <- all.cazys.copy[is.element(all.cazys.copy$substrate,substrates.arr),]
PULs substrates were obtained by merging the substrate metadata got from dbCAN. The cutoff value was 24 which corresponds to the median of the PULs frequency.
CAZymes substrates were obtained by merging the substrate metadata got from dbCAN and then exclude the GT family. The cutoff value was 54 which corresponds to the median of the CAZymes frequency.