suppressPackageStartupMessages({
library(readxl)
library(readr)
library(limma)
library(stringr)
library(rvest)
library(ggrepel)
library(kableExtra)
library(tidyverse)
library(POMA)
library(metabolomicsWorkbenchR)
library(SummarizedExperiment)
library(BiocFileCache)
library(enrichmet)
})
## Warning: package 'readxl' was built under R version 4.4.3
## Warning: package 'stringr' was built under R version 4.4.3
## Warning: package 'rvest' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tibble' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## Warning: package 'matrixStats' was built under R version 4.4.3
## Warning: package 'dbplyr' was built under R version 4.4.3
This document contains the preprocessing and analysis of two metabolomics datasets that will be used for benchmarking several enrichment analysis packages.
The datasets are:
This is an example introduced by Xuia et alt. in MetaboAnalyst and is used in most tutorials of this software.
The dataset is available as a .csv file in the metaboData repository in github (https://github.com/nutrimetabolomics/metaboData/blob/main/Datasets/2024-Cachexia/human_cachexia.csv). For implicity it has been downladed from there and re-stored as an Excel File.
human_cachexia <- read_excel("data/human_cachexia.xlsx")
str(human_cachexia)
## tibble [63 × 78] (S3: tbl_df/tbl/data.frame)
## $ Metabolite : chr [1:63] "1,6-Anhydro-beta-D-glucose" "1-Methylnicotinamide" "2-Aminobutyrate" "2-Hydroxyisobutyrate" ...
## $ cachPIF_178 : num [1:63] 40.9 65.4 18.7 26.1 71.5 ...
## $ cachPIF_087 : num [1:63] 62.2 340.4 24.3 41.7 67.4 ...
## $ cachPIF_090 : num [1:63] 270.4 64.7 12.2 65.4 23.8 ...
## $ cachNETL_005_V1 : num [1:63] 154.5 53 172.4 74.4 1199.9 ...
## $ cachPIF_115 : num [1:63] 22.2 73.7 15.6 83.9 33.1 ...
## $ cachPIF_110 : num [1:63] 212.7 31.8 18.4 80.6 47.9 ...
## $ cachNETL_019_V1 : num [1:63] 151.41 36.6 8.67 42.52 223.63 ...
## $ cachNETCR_014_V1: num [1:63] 31.5 6.82 4.18 12.94 25.03 ...
## $ cachNETCR_014_V2: num [1:63] 51.42 30.27 7.54 34.81 80.64 ...
## $ cachPIF_154 : num [1:63] 117.9 52.5 19.5 72.2 73.7 ...
## $ cachNETL_022_V1 : num [1:63] 20.7 221.4 15.2 28.8 357.8 ...
## $ cachNETL_022_V2 : num [1:63] 127.7 177.7 12.7 15 68 ...
## $ cachNETL_008_V1 : num [1:63] 59.74 50.91 6.82 46.06 111.05 ...
## $ cachPIF_146 : num [1:63] 89.1 32.8 10.4 32.1 32.5 ...
## $ cachPIF_119 : num [1:63] 23.57 6.89 2.12 7.85 8.33 ...
## $ cachPIF_099 : num [1:63] 41.26 8.67 2.56 7.85 6.89 ...
## $ cachPIF_162 : num [1:63] 589.9 22 15.2 46.1 32.8 ...
## $ cachPIF_160 : num [1:63] 112.2 25.3 15.5 47.9 28.8 ...
## $ cachPIF_113 : num [1:63] 167.3 19.9 13.5 31.2 47.9 ...
## $ cachPIF_143 : num [1:63] 183.09 90.92 8.94 64.07 20.49 ...
## $ cachNETCR_007_V1: num [1:63] 208.51 53.52 5.26 47.94 212.72 ...
## $ cachNETCR_007_V2: num [1:63] 34.8 95.6 23.6 68 287.1 ...
## $ cachPIF_137 : num [1:63] 333.62 35.87 7.92 54.6 20.49 ...
## $ cachPIF_100 : num [1:63] 32.46 9.68 3.9 11.02 170.72 ...
## $ cachNETL_004_V1 : num [1:63] 4.71 11.13 43.38 30.88 104.58 ...
## $ cachPIF_094 : num [1:63] 68.7 13.9 12.2 25 28.2 ...
## $ cachPIF_132 : num [1:63] 214.9 127.7 31.5 33.8 88.2 ...
## $ cachPIF_163 : num [1:63] 304.9 25.8 27.1 40.5 70.8 ...
## $ cachNETCR_003_V1: num [1:63] 37.71 10.8 5 8.25 11.7 ...
## $ cachNETL_028_V1 : num [1:63] 45.6 473.4 16.3 63.4 221.4 ...
## $ cachNETL_028_V2 : num [1:63] 34.12 92.76 8.25 16.61 55.15 ...
## $ cachNETCR_013_V1: num [1:63] 107.8 16.6 26.8 32.5 62.8 ...
## $ cachNETL_020_V1 : num [1:63] 13.33 50.91 2.92 40.85 46.99 ...
## $ cachNETL_020_V2 : num [1:63] 27.9 80.6 15.8 64.7 88.2 ...
## $ cachPIF_192 : num [1:63] 141.2 68 40.9 12.8 26.1 ...
## $ cachNETCR_012_V1: num [1:63] 14 46.1 29.1 24.5 64.1 ...
## $ cachNETCR_012_V2: num [1:63] 244.7 116.8 40 61.6 174.2 ...
## $ cachPIF_089 : num [1:63] 124 81.5 55.1 70.8 92.8 ...
## $ cachNETCR_002_V1: num [1:63] 141.2 28.5 20.3 14.3 97.5 ...
## $ cachPIF_179 : num [1:63] 35.16 26.58 5.21 30.27 7.39 ...
## $ cachPIF_114 : num [1:63] 685.4 36.2 32.5 85.6 25 ...
## $ cachNETCR_006_V1: num [1:63] 278.7 40.5 55.1 51.4 74.4 ...
## $ cachPIF_141 : num [1:63] 15.8 23.6 18 37.3 21.3 ...
## $ cachNETCR_025_V1: num [1:63] 29.96 96.54 6.55 65.37 1053.63 ...
## $ cachNETCR_025_V2: num [1:63] 16.95 114.43 2.53 77.48 2465.13 ...
## $ cachNETCR_016_V1: num [1:63] 292.9 58 167.3 82.3 468.7 ...
## $ cachPIF_116 : num [1:63] 29.67 70.11 5.58 18.73 5.53 ...
## $ ctrlPIF_191 : num [1:63] 18.92 24.53 3.29 10.49 9.68 ...
## $ ctrlPIF_164 : num [1:63] 127.74 1032.77 8.58 66.02 38.09 ...
## $ ctrlNETL_013_V1 : num [1:63] 34.81 12.3 5.87 15.18 16.78 ...
## $ ctrlPIF_188 : num [1:63] 65.37 24.05 4.71 15.8 7.24 ...
## $ ctrlPIF_195 : num [1:63] 15.18 94.63 11.36 8.17 5.64 ...
## $ ctrlNETCR_015_V1: num [1:63] 70.8 75.9 22.6 61 230.4 ...
## $ ctrlPIF_102 : num [1:63] 25.28 101.49 8.33 59.15 88.23 ...
## $ ctrlNETL_010_V1 : num [1:63] 34.47 12.81 3.78 8.33 14.3 ...
## $ ctrlNETL_010_V2 : num [1:63] 18.54 8.41 3.78 4.85 8.08 ...
## $ ctrlNETL_001_V1 : num [1:63] 37.34 55.15 7.39 36.23 75.94 ...
## $ ctrlNETCR_015_V2: num [1:63] 33.8 53.5 18.2 46.5 81.5 ...
## $ ctrlNETCR_005_V1: num [1:63] 22.4 55.1 20.7 38.5 164 ...
## $ ctrlPIF_111 : num [1:63] 146.9 10.1 6.3 27.9 24.1 ...
## $ ctrlPIF_171 : num [1:63] 64.07 6.42 28.79 18.92 85.63 ...
## $ ctrlNETCR_008_V1: num [1:63] 32.46 14.01 2.97 5.16 8.08 ...
## $ ctrlNETCR_008_V2: num [1:63] 113.3 43.38 4.66 27.11 22.42 ...
## $ ctrlNETL_017_V1 : num [1:63] 22.2 20.7 7.85 19.69 38.47 ...
## $ ctrlNETL_017_V2 : num [1:63] 46.53 9.78 3.1 9.3 10.59 ...
## $ ctrlNETL_002_V1 : num [1:63] 192.48 108.85 7.77 46.06 55.15 ...
## $ ctrlNETL_002_V2 : num [1:63] 528.5 225.9 13.5 93.7 230.4 ...
## $ ctrlPIF_190 : num [1:63] 28.79 9.21 5.53 17.64 14.44 ...
## $ ctrlNETCR_009_V1: num [1:63] 181.27 48.42 8.94 51.94 982.4 ...
## $ ctrlNETCR_009_V2: num [1:63] 47.47 7.69 4.06 9.3 65.37 ...
## $ ctrlNETL_007_V1 : num [1:63] 15.96 16.12 1.93 15.8 25.28 ...
## $ ctrlPIF_112 : num [1:63] 22.87 10.38 1.28 5.58 8.5 ...
## $ ctrlNETCR_019_V2: num [1:63] 35.2 52.5 13.9 44.3 99.5 ...
## $ ctrlNETL_012_V1 : num [1:63] 16.9 15.8 10.5 22.4 62.8 ...
## $ ctrlNETL_012_V2 : num [1:63] 9.39 14.01 5.16 23.57 46.99 ...
## $ ctrlNETL_003_V1 : num [1:63] 37.7 18.2 26.1 15 23.3 ...
## $ ctrlNETL_003_V2 : num [1:63] 38.5 12.6 15 12.6 22.2 ...
This dataset contains only metabolites that have been shown to be significantly different among the groups.
This is confirmed by the analysis performed in the following.
# 1. Definir el vector de grups a partir dels noms de les columnes
sample_names <- colnames(human_cachexia)[-1] # excepte 'Metabolite'
group <- ifelse(str_detect(sample_names, "^ctrl"), "Control", "Cachexia")
group <- factor(group)
# 2. Construir la matriu d'expressió (metabolits x mostres)
X <- as.matrix(human_cachexia[ , -1])
rownames(X) <- human_cachexia$Metabolite
# 3. Disseny experimental
design <- model.matrix(~ 0+group)
# 4. Ajustar model amb limma
fit <- lmFit(X, design)
fit <- eBayes(fit)
# 5. Resultats
res <- topTable(fit, n=Inf)
tail(res)
# 6. Mostrar els primers resultats
print(res)
## groupCachexia groupControl AveExpr F
## 2-Hydroxyisobutyrate 43.237660 27.871000 37.250649 102.108709
## Creatinine 10722.140213 5619.174667 8733.971818 88.727152
## Quinolinate 83.747234 39.324000 66.439481 85.721150
## Valine 45.582553 20.132667 35.667013 72.840052
## Dimethylamine 453.580638 208.683333 358.166104 68.123894
## Tryptophan 81.824043 41.833000 66.243117 65.057316
## Methylamine 21.216383 11.360000 17.376234 63.658759
## Pyroglutamate 270.292340 119.258000 211.447792 62.106598
## Leucine 31.261702 13.556667 24.363636 62.078871
## Asparagine 75.390851 41.749000 62.283636 59.785143
## N,N-Dimethylglycine 34.489787 13.596667 26.349610 58.618084
## tau-Methylhistidine 105.667660 64.650333 89.686883 58.028331
## Threonine 118.233191 59.518667 95.357403 56.120211
## Betaine 112.252979 55.970333 90.324675 56.039351
## Alanine 347.591064 157.584000 273.562338 55.825450
## 3-Indoxylsulfate 265.157660 146.376333 218.879221 55.619789
## Fucose 108.599149 57.444667 88.668831 55.197137
## Glutamine 391.410426 174.427333 306.871558 55.150421
## Serine 245.829787 122.263000 197.686883 53.127289
## Ethanolamine 326.772128 197.125333 276.260390 51.936097
## Citrate 2720.852766 1474.718667 2235.345974 47.687718
## Glycolate 219.269574 138.983667 187.989351 45.626282
## trans-Aconitate 48.814043 27.809333 40.630390 45.389616
## Hypoxanthine 67.087021 51.714333 61.097662 44.369447
## Guanidoacetate 97.624255 68.739667 86.370519 43.688379
## Tyrosine 100.742340 52.014000 81.757273 43.591865
## Isoleucine 9.660851 7.218000 8.709091 40.944145
## Histidine 364.232340 180.472333 292.637532 40.085486
## Uracil 37.513617 32.493333 35.557662 39.174458
## Glycine 1069.377872 585.149333 880.717403 37.753813
## 3-Hydroxybutyrate 29.260638 9.898667 21.717013 34.620652
## 3-Hydroxyisovalerate 27.606383 12.312667 21.647792 34.011973
## Acetate 85.632979 35.604667 66.141429 33.488539
## 4-Hydroxyphenylacetate 119.822553 99.798667 112.021039 33.419405
## myo-Inositol 181.837660 62.641333 135.397532 32.567554
## Pyruvate 26.865532 12.566333 21.294416 29.607625
## Lysine 121.282340 89.229333 108.794156 29.314284
## Hippurate 2875.729574 1364.240333 2286.837662 28.826437
## Formate 187.564468 84.483333 147.402987 28.648573
## 1,6-Anhydro-beta-D-glucose 128.688936 69.505333 105.630390 28.612486
## cis-Aconitate 276.025532 91.724000 204.219740 27.616880
## Taurine 655.720000 320.522333 525.123506 27.193100
## Succinate 79.628936 29.836000 60.229091 24.236966
## Trimethylamine N-oxide 820.340638 388.669000 652.156883 23.010912
## Trigonelline 359.637660 130.687000 270.436104 22.529110
## Methylguanidine 17.364681 12.128333 15.324545 22.429370
## Carnitine 64.622128 32.443667 52.085065 21.662941
## pi-Methylhistidine 441.553191 258.640000 370.288312 20.356000
## 2-Aminobutyrate 23.669149 9.528333 18.159740 19.570994
## O-Acetylcarnitine 25.564468 10.598000 19.733377 14.769399
## Fumarate 10.921915 4.552000 8.440130 13.782101
## Adipate 34.817872 8.993333 24.756364 12.299719
## 1-Methylnicotinamide 70.564255 73.155000 71.573636 11.074201
## Creatine 174.913404 51.504333 126.831948 10.655932
## Pantothenate 39.944043 52.622667 44.883766 10.485120
## Acetone 13.346383 8.420000 11.427013 9.177920
## Glucose 827.218936 140.958000 559.844545 9.022591
## Sucrose 150.024468 55.579667 113.227792 8.825967
## 2-Oxoglutarate 183.110426 85.517333 145.087143 7.779821
## 3-Aminoisobutyrate 100.274681 39.911000 76.756364 7.280801
## Xylose 129.289149 56.509333 100.933377 7.162336
## Lactate 217.631915 65.748333 158.456494 6.860428
## Tartrate 47.234681 28.676000 40.004026 6.063205
## P.Value adj.P.Val
## 2-Hydroxyisobutyrate 3.112959e-22 1.961164e-20
## Creatinine 1.400976e-20 4.413076e-19
## Quinolinate 3.481955e-20 7.312105e-19
## Valine 2.252837e-18 3.548218e-17
## Dimethylamine 1.172135e-17 1.476890e-16
## Tryptophan 3.564558e-17 3.742786e-16
## Methylamine 5.985424e-17 5.386882e-16
## Pyroglutamate 1.072912e-16 7.589719e-16
## Leucine 1.084246e-16 7.589719e-16
## Asparagine 2.612755e-16 1.646035e-15
## N,N-Dimethylglycine 4.120067e-16 2.359675e-15
## tau-Methylhistidine 5.197250e-16 2.728556e-15
## Threonine 1.112844e-15 5.173767e-15
## Betaine 1.149726e-15 5.173767e-15
## Alanine 1.253462e-15 5.264542e-15
## 3-Indoxylsulfate 1.362272e-15 5.363946e-15
## Fucose 1.617376e-15 5.769520e-15
## Glutamine 1.648434e-15 5.769520e-15
## Serine 3.791818e-15 1.257287e-14
## Ethanolamine 6.246496e-15 1.967646e-14
## Citrate 3.916828e-14 1.175048e-13
## Glycolate 9.866970e-14 2.825541e-13
## trans-Aconitate 1.098708e-13 3.009504e-13
## Hypoxanthine 1.752710e-13 4.600864e-13
## Guanidoacetate 2.401729e-13 6.052356e-13
## Tyrosine 2.511910e-13 6.086552e-13
## Isoleucine 8.782360e-13 2.049217e-12
## Histidine 1.329937e-12 2.992358e-12
## Uracil 2.075990e-12 4.509909e-12
## Glycine 4.201570e-12 8.823297e-12
## 3-Hydroxybutyrate 2.087031e-11 4.241385e-11
## 3-Hydroxyisovalerate 2.872418e-11 5.655073e-11
## Acetate 3.788622e-11 7.232824e-11
## 4-Hydroxyphenylacetate 3.930314e-11 7.282641e-11
## myo-Inositol 6.197112e-11 1.115480e-10
## Pyruvate 3.150939e-10 5.514144e-10
## Lysine 3.716309e-10 6.327769e-10
## Hippurate 4.897819e-10 8.120069e-10
## Formate 5.419160e-10 8.712437e-10
## 1,6-Anhydro-beta-D-glucose 5.531706e-10 8.712437e-10
## cis-Aconitate 9.796718e-10 1.505349e-09
## Taurine 1.252803e-09 1.879204e-09
## Succinate 7.291636e-09 1.068309e-08
## Trimethylamine N-oxide 1.551746e-08 2.221818e-08
## Trigonelline 2.096638e-08 2.935294e-08
## Methylguanidine 2.232087e-08 3.056989e-08
## Carnitine 3.623601e-08 4.857167e-08
## pi-Methylhistidine 8.400344e-08 1.102545e-07
## 2-Aminobutyrate 1.404706e-07 1.806051e-07
## O-Acetylcarnitine 3.832963e-06 4.829533e-06
## Fumarate 7.852109e-06 9.699664e-06
## Adipate 2.365777e-05 2.866230e-05
## 1-Methylnicotinamide 6.036273e-05 7.175193e-05
## Creatine 8.354921e-05 9.747408e-05
## Pantothenate 9.548709e-05 1.093761e-04
## Acetone 2.696052e-04 3.033059e-04
## Glucose 3.055788e-04 3.377450e-04
## Sucrose 3.582931e-04 3.891804e-04
## 2-Oxoglutarate 8.451408e-04 9.024385e-04
## 3-Aminoisobutyrate 1.281514e-03 1.345590e-03
## Xylose 1.415587e-03 1.461999e-03
## Lactate 1.826335e-03 1.855792e-03
## Tartrate 3.608916e-03 3.608916e-03
save(res, file="data/Cachexia_topTable.Rda")
write.table(rownames(res), file="data/cachexia_ids.txt", row.names = FALSE, quote=FALSE)
Metabolites in the table are identified by their common names, a type of identifier very useful for scientists working with these metabolites at chemical or nutritional level. However, most enrichment analysis tools require a more objective ID from public databases such as PubChem, HumanDB or KEGG. The associated identifiers for these metabolites, when available, have been obtained using MetaboAnalyst Compound ID converter.
cachexia_map <- read.csv("data/Cachexia_map.csv")
head(cachexia_map)
The obtained identifiers may be used as input for enrichment analysis.
This dataset comes from a study that has bee uploaded to Metabolomics Workbench with ID ST0000291: LC-MS Based Approaches to Investigate Metabolomic Differences in the Urine of Young Women after Drinking Cranberry Juice or Apple Juice
Once downloaded the data consists of three separate files
The metabolite names - 1541 row, 3 columns (original name, PubChem ID and KEGG ID)
Indeed the downloaded data are stored in a particular format “MWTab” so some functions to extract data from it have been prepared.
WARNING
Although the original study followed a 2×2 crossover design, the Metabolomics Workbench dataset (ST000291) does not include subject identifiers, and therefore paired or repeated-measures models cannot be applied. As a result, treatment groups must be analysed as independent samples.
This choice is conservative: ignoring the within-subject pairing increases residual variance and therefore raises the probability of false negatives, not false positives. Consequently, any metabolites that remain significant under this independent-samples analysis are likely to represent strong and biologically robust treatment effects.
Data in Metabolomics Workbench is not preprocessed so some work has to be performed before they can be analyzed.
The preprocessing steps are performed using the POMA Bioconductor package.
data_negative_mode <- do_query(
context = "study",
input_item = "analysis_id",
input_value = "AN000465",
output_item = "SummarizedExperiment")
data_positive_mode <- do_query(
context = "study",
input_item = "analysis_id",
input_value = "AN000464",
output_item = "SummarizedExperiment")
## ----metabolitenames, message = FALSE, warning = FALSE, comment = FALSE, echo = FALSE, fig.align = "center", out.width = "100%", fig.cap = 'Metabolite identifiers of the ST000291 Metabolomics Workbench study.'----
# knitr::include_graphics("figure/ST000291_names.png")
## ----warning = FALSE, message = FALSE, comment = FALSE------------------------
metaboliteNamesURL <- "https://www.metabolomicsworkbench.org/data/show_metabolites_by_study.php?STUDY_ID=ST000291&SEARCH_TYPE=KNOWN&STUDY_TYPE=MS&RESULT_TYPE=1"
metaboliteNames <- metaboliteNamesURL %>%
read_html() %>%
html_nodes(".datatable")
metaboliteNames_negative <- metaboliteNames %>%
.[[1]] %>%
html_table() %>%
dplyr::select(`Metabolite Name`, PubChemCompound_ID, `Kegg Id`)
metaboliteNames_positive <- metaboliteNames %>%
.[[2]] %>%
html_table() %>%
dplyr::select(`Metabolite Name`, PubChemCompound_ID, `Kegg Id`)
metaboliteNames <- bind_rows(metaboliteNames_negative, metaboliteNames_positive) %>%
dplyr::rename(names = 1, PubChem = 2, KEGG = 3) %>%
mutate(KEGG = ifelse(KEGG == "-", "UNKNOWN", KEGG),
PubChem = ifelse(PubChem == "-", "UNKNOWN", PubChem)) %>%
filter(!duplicated(PubChem))
## ----warning = FALSE, message = FALSE, comment = FALSE------------------------
## negative mode features
features_negative <- assay(data_negative_mode) %>%
dplyr::slice(-n())
rownames(features_negative) <- rowData(data_negative_mode)$metabolite[1:(length(rowData(data_negative_mode)$metabolite)-1)]
## positive mode features
features_positive <- assay(data_positive_mode) %>%
dplyr::slice(-n())
rownames(features_positive) <- rowData(data_positive_mode)$metabolite[1:(length(rowData(data_positive_mode)$metabolite)-1)]
## combine positive and negative mode and set PubChem IDs as feature names
features <- bind_rows(features_negative, features_positive) %>%
tibble::rownames_to_column("names") %>%
right_join(metaboliteNames, by = "names") %>%
select(-names, -KEGG) %>%
tibble::column_to_rownames("PubChem")
## metadata
pdata <- colData(data_negative_mode) %>% # or "data_positive_mode". They are equal
as.data.frame() %>%
tibble::rownames_to_column("ID") %>%
mutate(Treatment = case_when(Treatment == "Baseline urine" ~ "Baseline",
Treatment == "Urine after drinking apple juice" ~ "Apple",
Treatment == "Urine after drinking cranberry juice" ~ "Cranberry"))
## ----warning = FALSE, message = FALSE, comment = FALSE------------------------
pdata$Treatment <- as.character(pdata$Treatment)
pdata$Treatment <- factor(pdata$Treatment,
levels = c("Baseline", "Apple", "Cranberry"))
data_sumexp <- PomaCreateObject(metadata = pdata, features = t(features))
data_preprocessed <- data_sumexp %>%
PomaImpute() %>% # sense group_by
PomaNorm()
## 182 features removed.
The normalized, imputed feature matrix is now:
expr_all <- assay(data_preprocessed) # rows: features (PubChem), cols: samples
sample_metadata <- colData(data_preprocessed) %>%
as.data.frame()
dim(expr_all)
## [1] 1359 45
head(expr_all[, 1:5])
## b1 b10 b11 b12 b13
## X443489 0.6947997 0.4887623 0.2874114 0.6073592 -0.8454956
## X107754 1.2285717 1.0538299 2.0280299 -1.6045545 -0.3914194
## X9543071 0.4526208 0.1373599 6.0661877 0.3424920 0.1880878
## X11011465 0.3876013 -1.1626013 1.0143621 0.6077095 0.2879577
## X5281160 0.8226910 -0.5901961 3.0649147 -0.3466212 0.4142565
## X440341 0.4704552 1.3992524 1.3624330 0.6902061 1.9507883
We focus on Baseline vs Cranberry, ignoring Apple for the main comparison.
limma_resCB <- data_preprocessed %>%
PomaLimma(contrast = "Cranberry-Baseline", adjust = "fdr") %>%
dplyr::rename(PubChemCID = feature) %>%
dplyr::mutate(PubChemCID = gsub("X", "", PubChemCID)) %>%
filter(PubChemCID != "UNKNOWN")
# show the first 10 features
limma_resCB %>%
dplyr::slice(1L:10L) %>%
kbl(row.names = FALSE, booktabs = TRUE) %>%
kable_styling(latex_options = c("striped"))
| PubChemCID | log2FC | AveExpr | t | pvalue | adj_pvalue | B |
|---|---|---|---|---|---|---|
| 54678503 | 2.916890 | 0 | 6.446862 | 1.0e-07 | 0.0000847 | 8.108057 |
| 71485 | 3.267987 | 0 | 5.853017 | 5.0e-07 | 0.0003289 | 6.194381 |
| 94214 | 2.180373 | 0 | 5.691638 | 8.0e-07 | 0.0003817 | 5.676529 |
| 5378303 | 2.414795 | 0 | 5.543843 | 1.4e-06 | 0.0004030 | 5.203836 |
| 3037 | 3.179669 | 0 | 5.463171 | 1.8e-06 | 0.0004030 | 4.946595 |
| 10178 | 3.331534 | 0 | 5.454885 | 1.9e-06 | 0.0004030 | 4.920207 |
| 443071 | 3.615367 | 0 | 5.402912 | 2.3e-06 | 0.0004030 | 4.754851 |
| 5486800 | -1.784212 | 0 | -5.388792 | 2.4e-06 | 0.0004030 | 4.709975 |
| 17531 | 2.262358 | 0 | 5.318449 | 3.0e-06 | 0.0004550 | 4.486739 |
| 3035199 | 1.970513 | 0 | 5.048492 | 7.5e-06 | 0.0010196 | 3.635733 |
# save(limma_res, file="data/MW_ST000291-limma_res.Rda")
In order to verify some calculations we wish to compare the results with those obtained with MetaboAnalyst. This requires metabolites to have either KEGG or HMDB identifiers.
Pubchem IDs,converted to other systems are available from file
MW_ST000291__map.csv
MW_map <- readr::read_csv("data/MW_ST000291__map.csv", show_col_types = FALSE) %>%
janitor::clean_names() %>%
mutate(
PubChemCID = as.character(query), # alinyem nom
HMDB = ifelse(hmdb == "" | is.na(hmdb), NA, hmdb)
) %>%
select(PubChemCID, HMDB)
head(MW_map)
First we filter out all metabolites not mapped to HMDB
expr_all <- assay(data_preprocessed) # rows = PubChem, cols = samples
metadata <- colData(data_preprocessed) %>% as.data.frame()
pubchem_ids <- gsub("^X", "", rownames(expr_all))
map_expr <- tibble(
PubChemCID = pubchem_ids
) %>%
left_join(MW_map, by = "PubChemCID")
keep <- !is.na(map_expr$HMDB)
data_preprocessed_HMDB <- data_preprocessed[keep, ]
head(assay(data_preprocessed_HMDB))
## b1 b10 b11 b12 b13 b14
## X1493 -0.3857251 0.5896296 0.54742922 0.1164362 -0.47291789 -0.4337673
## X439287 0.8480117 0.6384995 1.19732710 0.9622080 0.33633347 1.0182369
## X11954071 1.7701857 -1.8913284 -0.25736257 1.9016509 -2.07200711 1.0347154
## X955 0.1893927 0.2560652 0.45048446 -0.7754898 3.13547259 0.1031689
## X439435 0.5253812 0.5872576 1.11729290 1.5091126 0.37094978 0.2745189
## X5280417 0.5115108 0.3041352 0.07776652 0.8087114 0.02411856 0.2849774
## b15 b16 b17 b2 b4 b6
## X1493 -0.50339528 -0.5980115 -1.264429067 1.02806969 1.7935068 -1.06711263
## X439287 1.14709740 -1.1610238 -0.530515703 1.05994669 0.3079829 -0.57782609
## X11954071 -0.81381238 -0.1129643 -6.590653562 0.38510617 2.8580052 -0.18370515
## X955 -0.41612685 -1.9867349 -0.005223076 0.09744963 -0.4141981 0.02609954
## X439435 0.69287277 -0.3734512 -0.943917161 0.33415231 0.8340301 0.16946371
## X5280417 0.07079359 -1.2176732 -1.986746348 0.55775455 -0.6177386 -0.22425248
## b7 b8 b9 a1 a10 a11
## X1493 -1.4476851 -0.2204205 -1.5104792 0.65170919 -0.87747074 1.39380953
## X439287 -0.5268649 0.9331002 -1.0162866 -1.60633724 -0.45853608 -0.12101487
## X11954071 0.9241943 1.4818335 0.6469588 -1.29141809 -1.54597449 2.44324527
## X955 -0.3763935 0.4011452 -0.4778704 -0.22378193 0.18099625 -0.08748566
## X439435 -0.3964235 0.2480054 -0.8538239 -1.96555512 -0.06216092 0.86879922
## X5280417 -0.9627714 -0.5529711 -0.8311039 -0.08952971 2.97574517 3.17851086
## a12 a13 a14 a15 a16 a17
## X1493 2.1569911 -1.7455285 -0.6923914 0.2035022 -0.5004495 0.82366064
## X439287 1.3242346 -0.5063997 -0.4415263 0.6427600 -2.1637777 -1.57477381
## X11954071 0.4952610 1.7717785 -2.7647214 -1.0158740 -0.5289327 0.02863751
## X955 -0.4799279 0.4663974 -0.5772599 -0.8356482 -2.9098726 -0.34177311
## X439435 1.7223901 0.3047137 -0.7981083 0.1694637 -1.0097961 -1.76604313
## X5280417 1.0528224 -0.5122198 -0.1817010 0.2425141 -0.8235913 -0.24878124
## a2 a4 a6 a7 a8 a9
## X1493 1.5290714 -0.03154813 -0.9065260 0.05353628 1.3642407 -0.4902117
## X439287 1.3672212 0.54039379 -0.6789370 -0.02407401 -0.5244376 -0.2179557
## X11954071 2.0362327 0.80543563 -2.5731447 1.05936330 2.3560964 1.4571594
## X955 -0.2997101 0.27541056 0.3402795 -0.24953423 0.1312785 -0.3970065
## X439435 0.7141060 0.34654658 -0.7765412 0.01660204 0.1646760 -1.6639785
## X5280417 0.2625771 -0.82640131 -0.7717874 0.32024119 3.3518645 -0.9837162
## c1 c10 c11 c12 c13
## X1493 0.45684713 0.224058670 0.1895424 1.6307984 -1.88322973
## X439287 0.55888579 0.432807707 -0.1640363 1.4969648 -0.51118131
## X11954071 -1.89249794 -1.101837458 1.2320961 -0.2896689 0.86798501
## X955 -0.08610293 0.450484459 0.5276003 0.4504845 2.76915890
## X439435 -0.73455888 -0.005368297 0.6170156 1.1856602 0.54637070
## X5280417 0.10175942 0.413717719 -0.2291056 -0.8245270 0.09155411
## c14 c15 c16 c17 c2 c4
## X1493 1.05632816 -1.0861729 -0.1924996 0.85524923 1.03947573 -1.41584273
## X439287 0.95258966 0.4681795 0.8409154 -0.52080656 -0.05545967 -1.00316823
## X11954071 -2.47134436 -0.2085014 -0.9126684 -0.28447523 1.37902016 0.52460407
## X955 2.45478738 0.0695257 -1.6123407 -0.60726340 0.13238643 0.17994164
## X439435 -0.03342162 1.0078244 -0.1279754 -0.96630600 0.49679919 -0.37345123
## X5280417 0.85717358 0.3744679 -1.7591859 -0.03289151 0.14147323 0.06376681
## c6 c7 c8 c9
## X1493 1.1077750 -0.8492909 1.4392574 -1.6758193
## X439287 -0.4827946 -1.3680218 -0.4619694 -0.3759711
## X11954071 0.6682649 -2.5308987 1.3248235 1.8811378
## X955 0.7755421 -0.4141981 -0.7480831 0.4584730
## X439435 0.4675047 -0.2858048 -1.2779528 -0.8768713
## X5280417 -0.3596963 -1.8702919 0.4490377 -0.6103103
hmdb_ids <- map_expr$HMDB[keep]
rownames(data_preprocessed)[keep] <- hmdb_ids
data_preprocessed_HMDB <- data_preprocessed[keep, ]
head(assay(data_preprocessed_HMDB))
## b1 b10 b11 b12 b13 b14
## HMDB0245462 -0.3857251 0.5896296 0.54742922 0.1164362 -0.47291789 -0.4337673
## HMDB0001351 0.8480117 0.6384995 1.19732710 0.9622080 0.33633347 1.0182369
## HMDB0062198 1.7701857 -1.8913284 -0.25736257 1.9016509 -2.07200711 1.0347154
## HMDB0304087 0.1893927 0.2560652 0.45048446 -0.7754898 3.13547259 0.1031689
## HMDB0003503 0.5253812 0.5872576 1.11729290 1.5091126 0.37094978 0.2745189
## HMDB0029263 0.5115108 0.3041352 0.07776652 0.8087114 0.02411856 0.2849774
## b15 b16 b17 b2 b4
## HMDB0245462 -0.50339528 -0.5980115 -1.264429067 1.02806969 1.7935068
## HMDB0001351 1.14709740 -1.1610238 -0.530515703 1.05994669 0.3079829
## HMDB0062198 -0.81381238 -0.1129643 -6.590653562 0.38510617 2.8580052
## HMDB0304087 -0.41612685 -1.9867349 -0.005223076 0.09744963 -0.4141981
## HMDB0003503 0.69287277 -0.3734512 -0.943917161 0.33415231 0.8340301
## HMDB0029263 0.07079359 -1.2176732 -1.986746348 0.55775455 -0.6177386
## b6 b7 b8 b9 a1
## HMDB0245462 -1.06711263 -1.4476851 -0.2204205 -1.5104792 0.65170919
## HMDB0001351 -0.57782609 -0.5268649 0.9331002 -1.0162866 -1.60633724
## HMDB0062198 -0.18370515 0.9241943 1.4818335 0.6469588 -1.29141809
## HMDB0304087 0.02609954 -0.3763935 0.4011452 -0.4778704 -0.22378193
## HMDB0003503 0.16946371 -0.3964235 0.2480054 -0.8538239 -1.96555512
## HMDB0029263 -0.22425248 -0.9627714 -0.5529711 -0.8311039 -0.08952971
## a10 a11 a12 a13 a14 a15
## HMDB0245462 -0.87747074 1.39380953 2.1569911 -1.7455285 -0.6923914 0.2035022
## HMDB0001351 -0.45853608 -0.12101487 1.3242346 -0.5063997 -0.4415263 0.6427600
## HMDB0062198 -1.54597449 2.44324527 0.4952610 1.7717785 -2.7647214 -1.0158740
## HMDB0304087 0.18099625 -0.08748566 -0.4799279 0.4663974 -0.5772599 -0.8356482
## HMDB0003503 -0.06216092 0.86879922 1.7223901 0.3047137 -0.7981083 0.1694637
## HMDB0029263 2.97574517 3.17851086 1.0528224 -0.5122198 -0.1817010 0.2425141
## a16 a17 a2 a4 a6
## HMDB0245462 -0.5004495 0.82366064 1.5290714 -0.03154813 -0.9065260
## HMDB0001351 -2.1637777 -1.57477381 1.3672212 0.54039379 -0.6789370
## HMDB0062198 -0.5289327 0.02863751 2.0362327 0.80543563 -2.5731447
## HMDB0304087 -2.9098726 -0.34177311 -0.2997101 0.27541056 0.3402795
## HMDB0003503 -1.0097961 -1.76604313 0.7141060 0.34654658 -0.7765412
## HMDB0029263 -0.8235913 -0.24878124 0.2625771 -0.82640131 -0.7717874
## a7 a8 a9 c1 c10
## HMDB0245462 0.05353628 1.3642407 -0.4902117 0.45684713 0.224058670
## HMDB0001351 -0.02407401 -0.5244376 -0.2179557 0.55888579 0.432807707
## HMDB0062198 1.05936330 2.3560964 1.4571594 -1.89249794 -1.101837458
## HMDB0304087 -0.24953423 0.1312785 -0.3970065 -0.08610293 0.450484459
## HMDB0003503 0.01660204 0.1646760 -1.6639785 -0.73455888 -0.005368297
## HMDB0029263 0.32024119 3.3518645 -0.9837162 0.10175942 0.413717719
## c11 c12 c13 c14 c15 c16
## HMDB0245462 0.1895424 1.6307984 -1.88322973 1.05632816 -1.0861729 -0.1924996
## HMDB0001351 -0.1640363 1.4969648 -0.51118131 0.95258966 0.4681795 0.8409154
## HMDB0062198 1.2320961 -0.2896689 0.86798501 -2.47134436 -0.2085014 -0.9126684
## HMDB0304087 0.5276003 0.4504845 2.76915890 2.45478738 0.0695257 -1.6123407
## HMDB0003503 0.6170156 1.1856602 0.54637070 -0.03342162 1.0078244 -0.1279754
## HMDB0029263 -0.2291056 -0.8245270 0.09155411 0.85717358 0.3744679 -1.7591859
## c17 c2 c4 c6 c7
## HMDB0245462 0.85524923 1.03947573 -1.41584273 1.1077750 -0.8492909
## HMDB0001351 -0.52080656 -0.05545967 -1.00316823 -0.4827946 -1.3680218
## HMDB0062198 -0.28447523 1.37902016 0.52460407 0.6682649 -2.5308987
## HMDB0304087 -0.60726340 0.13238643 0.17994164 0.7755421 -0.4141981
## HMDB0003503 -0.96630600 0.49679919 -0.37345123 0.4675047 -0.2858048
## HMDB0029263 -0.03289151 0.14147323 0.06376681 -0.3596963 -1.8702919
## c8 c9
## HMDB0245462 1.4392574 -1.6758193
## HMDB0001351 -0.4619694 -0.3759711
## HMDB0062198 1.3248235 1.8811378
## HMDB0304087 -0.7480831 0.4584730
## HMDB0003503 -1.2779528 -0.8768713
## HMDB0029263 0.4490377 -0.6103103
Now the analysis can be reproduced
limma_HMDB_CB <- data_preprocessed_HMDB %>%
PomaLimma(contrast = "Cranberry-Baseline", adjust = "fdr")
head(limma_HMDB_CB)
background_HMDB <- limma_HMDB_CB$feature
selected_HMDB <- limma_HMDB_CB %>%
filter(adj_pvalue < 0.25) %>%
pull(feature)
The topTables are saved to be used later:
save(limma_resCB, limma_HMDB_CB, file="data/MW_ST000291-limma_res.Rda")
We remove unnecessary packages used onlñy in this section
detach(package:POMA)
detach(package:SummarizedExperiment)
enrichmet
package exmple dataThe enrichmetpackage has deposited its example data in
Zotero. For reproducibility we download the data directly from
there:
To run the enrichmet package, you need to provide at least the following three files:
PathwayVsMetabolitesThe mappings are derived from the KEGG database (https://www.kegg.jp/),
accessed programmatically via the KEGGREST package. To ensure
reproducibility and minimize run-time delays, curated human-specific
pathway–metabolite mappings are periodically updated and archived on
Zenodo [1]. We provide a human-specific
PathwayVsMetabolites file available here:
Human-PathwaysVsMetabolites.csv
However, users can supply a file tailored to their species of
interest.
InputMetabolitesFor demonstration purposes, we also include summary statistics derived from an example experimental dataset. These example data are intended solely to showcase the package workflow and will be described in detail in a subsequent publication.
SummaryStatisticsThis information is not generated by the package and must be provided by the user. Summary statistics may be supplied either as a standalone data frame or extracted directly from the rowData() of a SummarizedExperiment object, provided that appropriate columns are available for metabolite identifiers, p-values, and fold changes (see section 2.4 example_data).
For demonstration purposes, the package includes an example summary statistics dataset. These example data are provided solely to illustrate the workflow and will be described in detail in a forthcoming publication.
The enrichmet() function integrates both p-values and fold changes when performing MetSEA. By considering the direction and magnitude of changes, this approach yields richer biological insights than methods based only on binary classifications of “significant” versus “not significant.”
Including these optional files can enhance the analysis and its biological interpretation:
KEGGLookup
A reference file linking KEGG compound IDs (https://www.kegg.jp/) to
metabolite names or other annotations, which helps standardize
identifiers and improve pathway mapping [1]. We provide a KEGGLookup
file here:
kegg_lookup.xlsx
Users may also provide a lookup file specific to their species.
STITCHInteractions
A dataset of known or predicted interactions between metabolites or
proteins, obtained from the freely available STITCH database (http://stitch.embl.de/ ).
These interactions are useful for constructing networks to support
pathway analysis [2]. We supply an example file here:
chemical_chemical.tsv
Users may provide their own interaction data appropriate for their
species.
mapping_df
A mapping data frame that links STITCH identifiers with metabolite
names, facilitating integration between datasets [2]. The provided
example file is available here:
mapping_df.xlsx
Users can substitute this with a mapping file relevant to their species
of interest.
We used summary statistics from differential analyses based on metabolomic measurements comparing KrasG12D mutations with and without TAp73 deletion. The top 50 metabolites, ranked by significance, were selected for pathway enrichment analysis using Fisher’s Exact Test. These summary statistics were also utilized for MetSEA analysis.
# InputMetabolites: A vector or list containing the metabolites of interest.
inputMetabolites <- c(
"C02721", "C03736", "C03139", "C00074", "C02862", "C00092", "C00275",
"C00386", "C10172", "C00361", "C00242", "C19806", "C16358", "C16357",
"C16353", "C02140", "C05488", "C01181", "C03017", "C02632", "C00246",
"C00079", "C08317", "C00588", "C16527", "C00065", "C00249", "C00836",
"C02990", "C03690", "C04025", "C06424", "C00123", "C00606", "C03067",
"C00539", "C00633", "C00519", "C00366", "C00148", "C00219", "C08261",
"C00385", "C06429", "C05637", "C00295", "C06428", "C16513", "C01909",
"C03299", "C01530", "C01571", "C08278", "C00599", "C15587", "C00144",
"C03739"
)
# Standard cache location
cache_dir <- tools::R_user_dir("enrichmet", "cache")
bfc <- BiocFileCache(cache_dir, ask = FALSE)
# Helper function to download & cache any file from Zenodo
get_cached_file <- function(url) {
bfcrpath(bfc, url)
}
# -------------------------
# Load files with caching
# -------------------------
# PathwayVsMetabolites
pathway_url <- "https://zenodo.org/api/records/15498097/files/Human-PathwaysVsMetabolites.csv/content"
pathway_path <- get_cached_file(pathway_url)
PathwayVsMetabolites <- read.csv(pathway_path)
str(PathwayVsMetabolites)
## 'data.frame': 351 obs. of 2 variables:
## $ Pathway : chr "Glycolysis / Gluconeogenesis" "Citrate cycle (TCA cycle)" "Pentose phosphate pathway" "Pentose and glucuronate interconversions" ...
## $ Metabolites: chr "C00022,C00024,C00031,C00033,C00036,C00068,C00074,C00084,C00085,C00103,C00111,C00118,C00186,C00197,C00221,C00236"| __truncated__ "C00022,C00024,C00026,C00036,C00042,C00068,C00074,C00091,C00122,C00149,C00158,C00311,C00417,C05125,C05379,C05381"| __truncated__ "C00022,C00031,C00085,C00117,C00118,C00119,C00121,C00197,C00198,C00199,C00204,C00221,C00231,C00257,C00258,C00279"| __truncated__ "C00022,C00026,C00029,C00103,C00111,C00116,C00167,C00181,C00191,C00199,C00204,C00216,C00231,C00259,C00266,C00309"| __truncated__ ...
# SummaryStatistics
example_url <- "https://zenodo.org/api/records/15498097/files/example_data.xlsx/content"
example_path <- get_cached_file(example_url)
example_data <- read_excel(example_path)
show(example_data[1:3,])
## # A tibble: 3 × 4
## met_id pval padj log2fc
## <chr> <dbl> <dbl> <dbl>
## 1 C02721 0.0000124 0.00315 -2.32
## 2 C03736 0.0000694 0.00739 2.63
## 3 C03139 0.0000869 0.00739 1.75
# KEGGLookup
kegg_url <- "https://zenodo.org/api/records/15498097/files/kegg_lookup.xlsx/content"
kegg_path <- get_cached_file(kegg_url)
kegg_lookup <- read_excel(kegg_path)
head(kegg_lookup)
# Mapping_df
mapping_url <- "https://zenodo.org/api/records/15498097/files/mapping_df.xlsx/content"
mapping_path <- get_cached_file(mapping_url)
mapping_df <- read_excel(mapping_path)
head(mapping_df)
# STITCHInteractions
# stitch_url <- "https://zenodo.org/records/15498097/files/chemical_chemical.tsv"
# stitch_path <- get_cached_file(stitch_url)
# stitch_df <- read_tsv(stitch_path)
# head(stitch_df)
The data from “EnrichMetExample” is not an EnrichmentSet, so a new
one is created from the PathwayvsMetabolites
data.frame:
First we prepare the long version of
PathwayVsMetabolites:
tmp <- PathwayVsMetabolites %>%
mutate(Metabolites = strsplit(Metabolites, ","))
unnested <- tidyr::unnest(tmp, cols = c(Metabolites))
PathwayVsMetabolites_long <- dplyr::rename(unnested, kegg_id = Metabolites)
head(PathwayVsMetabolites_long)
PathwayVsMetabolites_long <- PathwayVsMetabolites_long %>%
mutate(PathwayID = paste0("PW_", as.integer(as.factor(Pathway))))
head(PathwayVsMetabolites_long)
PathwayVsMetabolites_long <- PathwayVsMetabolites_long %>%
left_join(kegg_lookup, by = c("kegg_id" = "kegg_id"))
## Warning in left_join(., kegg_lookup, by = c(kegg_id = "kegg_id")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 132 of `x` matches multiple rows in `y`.
## ℹ Row 159 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
head(PathwayVsMetabolites_long)
With this we can directly build the EnrichmentSet
object:
require(localEnrichment)
## Cargando paquete requerido: localEnrichment
KEGGset_KEGG <- buildEnrichmentSet(
data = PathwayVsMetabolites_long,
id_col = "kegg_id",
category_col = "Pathway",
set_id_col = "PathwayID",
set_name = "KEGG_hsa_pathways",
source = "KEGG",
species = "Homo sapiens",
version = Sys.Date(),
description = "KEGG human pathways mapped to KEGG compound IDs",
sep = ";"
)
summary(KEGGset_KEGG)
## EnrichmentSet summary:
## Mapping name: KEGG_hsa_pathways
## Source: KEGG
## Feature IDs: kegg_id
## Number of sets: 351
## Mean set size: 16.37607 features
## Median set size: 6 features
head(KEGGset_KEGG@data)
All the data is saved as a binary file to be used later
EM_inputMetabolites <- inputMetabolites
EM_summaryStats <- example_data
EM_PathwayVsMetabolites<- PathwayVsMetabolites
EM_mapping_df <- mapping_df
# EM_stitch_df <- stitch_df
EM_kegg_lookup <- kegg_lookup
save(EM_inputMetabolites, KEGGset_KEGG, EM_PathwayVsMetabolites, EM_summaryStats, EM_kegg_lookup, EM_mapping_df, file="data/enrichmet_example.Rda")