MAINHEALTH: intergenerational obesity transmission

Introduction

This article presents the CP results of the MAINHEALTH dataset discussed in Chapter 5 of my dissertation.

Preamble

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(parafac4microbiome)
library(NPLStoolbox)
library(CMTFtoolbox)

## 
## Attaching package: 'CMTFtoolbox'

## The following object is masked from 'package:NPLStoolbox':
## 
##     npred

## The following objects are masked from 'package:parafac4microbiome':
## 
##     fac_to_vect, reinflateFac, reinflateTensor, vect_to_fac

library(ggplot2)
library(ggpubr)
library(stringr)
library(scales)
library(ggpattern)

testMetadata = function(model, comp, metadata){
  transformedSubjectLoadings = model$Fac[[1]][,comp]
  transformedSubjectLoadings = transformedSubjectLoadings / norm(transformedSubjectLoadings, "2")
  
  result = lm(transformedSubjectLoadings ~ BMI + C.section + Secretor + Lewis + whz.6m, data=metadata$mode1)
  
  # Extract coefficients and confidence intervals
  coef_estimates <- summary(result)$coefficients
  conf_intervals <- confint(result)
  
  # Remove intercept
  coef_estimates <- coef_estimates[rownames(coef_estimates) != "(Intercept)", ]
  conf_intervals <- conf_intervals[rownames(conf_intervals) != "(Intercept)", ]
  
  # Combine into a clean data frame
  summary_table <- data.frame(
    Term     = rownames(coef_estimates),
    Estimate = coef_estimates[, "Estimate"] * 1e3,
    CI       = paste0(
                 conf_intervals[, 1], " – ",
                 conf_intervals[, 2]
               ),
    P_value  = coef_estimates[, "Pr(>|t|)"],
    P_adjust = p.adjust(coef_estimates[, "Pr(>|t|)"], "BH"),
    row.names = NULL
  )
  
  return(summary_table)
}

colours = RColorBrewer:: brewer.pal(8, "Dark2")
BMI_cols = c("darkgreen","darkgoldenrod","darkred") #colours[1:3]
WHZ_cols = c("darkgreen", "darkgoldenrod", "dodgerblue4")
phylum_cols = colours

processedFaeces = parafac4microbiome::processDataCube(NPLStoolbox::Jakobsen2025$faeces, sparsityThreshold = 0.75, centerMode=1, scaleMode=2)
processedMilk = parafac4microbiome::processDataCube(NPLStoolbox::Jakobsen2025$milkMicrobiome, sparsityThreshold=0.85, centerMode=1, scaleMode=2)
processedMilkMetab = parafac4microbiome::processDataCube(NPLStoolbox::Jakobsen2025$milkMetabolomics, CLR=FALSE, centerMode=1, scaleMode=2)

colourCols = c("", "", "")
legendTitles = c("BMI group", "Phylum", "")
xLabels = c("Subject index", "Feature index", "Time index")
legendColNums = c(3,5,0)
arrangeModes = c(TRUE, TRUE, FALSE)
continuousModes = c(FALSE,FALSE,TRUE)

Faecal microbiome

CP was applied to capture the dominant source of variation in the infant faecal microbiome data block. The model selection procedure revealed that CORCONDIA was inconclusive and loadings were somewhat unstable at three components. The commented code below performs the procedure, and is not rerun here due to computational limitations.

# assessment_faeces = parafac4microbiome::assessModelQuality(processedFaeces$data, numRepetitions = 10, numCores = 10)
# assessment_faeces$metrics$CORCONDIA %>% as_tibble() %>% mutate(index=1:nrow(.)) %>% pivot_longer(-index) %>% ggplot(aes(x=as.factor(name),y=value)) + geom_boxplot() + xlab("Number of components") + ylab("CORCONDIA") + theme(text=element_text(size=16))

# stability_faeces = parafac4microbiome::assessModelStability(processedFaeces, maxNumComponents=3, numFolds=20, colourCols=colourCols, legendTitles=legendTitles, xLabels = xLabels, legendColNums=legendColNums, arrangeModes=arrangeModes, numCores=parallel::detectCores())
# stability_faeces$modelPlots[[2]]
# stability_faeces$modelPlots[[3]]
#

# cp_faeces = parafac4microbiome::parafac(processedFaeces$data, nfac=3, nstart = 100)
cp_faeces = readRDS("./MAINHEALTH/cp_faeces.RDS")

cp_faeces$varExp

## [1] 15.17823

Therefore, a two-component CP model was selected, explaining 15.2% of the variation. However, MLR analysis revealed that neither component could be interpreted.

testMetadata(cp_faeces, 1, processedFaeces)

##        Term   Estimate                                         CI    P_value
## 1       BMI   1.557682 -0.00109277376368493 – 0.00420813678843941 0.24698916
## 2 C.section -17.096372   -0.0655708277484862 – 0.0313780847296313 0.48646498
## 3  Secretor  11.375601   -0.0221113148908448 – 0.0448625169392564 0.50262459
## 4     Lewis -86.360039   -0.181248173947069 – 0.00852809560447294 0.07407419
## 5    whz.6m  -3.321885   -0.0160020917474101 – 0.0093583227214069 0.60504067
##    P_adjust
## 1 0.6050407
## 2 0.6050407
## 3 0.6050407
## 4 0.3703709
## 5 0.6050407

testMetadata(cp_faeces, 2, processedFaeces)

##        Term   Estimate                                          CI    P_value
## 1       BMI   2.722216 -4.56020479994157e-06 – 0.00544899180328022 0.05037851
## 2 C.section  46.004176     -0.003866118532671 – 0.0958744696412498 0.07028311
## 3  Secretor  -7.108612      -0.041559794510577 – 0.027342570748067 0.68370085
## 4     Lewis -60.366832     -0.157987302185625 – 0.0372536372194021 0.22330820
## 5    whz.6m   1.450616    -0.0115947217747149 – 0.0144959541369427 0.82617195
##    P_adjust
## 1 0.1757078
## 2 0.1757078
## 3 0.8261720
## 4 0.3721803
## 5 0.8261720

HM microbiome

CP was applied to capture the dominant source of variation in the HM microbiome data block. The model selection procedure revealed that CORCONDIA was inconclusive and loadings were unstable at three components. The commented code below performs the procedure, and is not rerun here due to computational limitations.

# assessment_milk = parafac4microbiome::assessModelQuality(processedMilk$data, numRepetitions = 10, numCores = 10)
# assessment_milk$metrics$CORCONDIA %>% as_tibble() %>% mutate(index=1:nrow(.)) %>% pivot_longer(-index) %>% ggplot(aes(x=as.factor(name),y=value)) + geom_boxplot() + xlab("Number of components") + ylab("CORCONDIA") + theme(text=element_text(size=16))

# stability_milk = parafac4microbiome::assessModelStability(processedMilk, maxNumComponents=3, numFolds=10, colourCols=colourCols, legendTitles=legendTitles, xLabels = xLabels, legendColNums=legendColNums, arrangeModes=arrangeModes, numCores=parallel::detectCores())
# stability_milk$modelPlots[[2]]
# stability_milk$modelPlots[[3]]
#

# cp_milk = parafac4microbiome::parafac(processedMilk$data, nfac=2, nstart = 100)
cp_milk = readRDS("./MAINHEALTH/cp_milk.RDS")

cp_milk$varExp

## [1] 13.17473

Therefore, a two-component CP model was selected, explaining 13.2% of the variation. However, MLR analysis revealed that the first CP component was uninterpretable, while the second captured variation only weakly associated with maternal ppBMI.

testMetadata(cp_milk, 1, processedMilk)

##        Term    Estimate                                         CI   P_value
## 1       BMI  -0.7971158 -0.00356908227805278 – 0.00197485067980183 0.5702715
## 2 C.section -27.2768492   -0.0779905182386613 – 0.0234368199177587 0.2891367
## 3  Secretor   4.8853309    -0.0301641355434441 – 0.039934797247701 0.7830997
## 4     Lewis  28.0599419     -0.071151776413022 – 0.127271660144112 0.5766273
## 5    whz.6m   2.3991703    -0.0108741423856309 – 0.015672483059881 0.7211329
##    P_adjust
## 1 0.7830997
## 2 0.7830997
## 3 0.7830997
## 4 0.7830997
## 5 0.7830997

testMetadata(cp_milk, 2, processedMilk)

##        Term   Estimate                                         CI    P_value
## 1       BMI   2.942392 0.000305899155017146 – 0.00557888463379954 0.02901701
## 2 C.section  25.569124   -0.0226660263261727 – 0.0738042746886722 0.29612537
## 3  Secretor   6.165320   -0.0271711809286324 – 0.0395018213417433 0.71494998
## 4     Lewis -74.787457    -0.169150421245261 – 0.0195755080850167 0.11927081
## 5    whz.6m  -3.204212  -0.0158288212471368 – 0.00942039651018406 0.61630886
##    P_adjust
## 1 0.1450850
## 2 0.4935423
## 3 0.7149500
## 4 0.2981770
## 5 0.7149500

topIndices = processedMilk$mode2 %>% mutate(index=1:nrow(.), Comp = cp_milk$Fac[[2]][,2]) %>% arrange(desc(Comp)) %>% head() %>% select(index) %>% pull()
bottomIndices = processedMilk$mode2 %>% mutate(index=1:nrow(.), Comp = cp_milk$Fac[[2]][,2]) %>% arrange(desc(Comp)) %>% tail() %>% select(index) %>% pull()

Xhat = parafac4microbiome::reinflateTensor(cp_milk$Fac[[1]][,2], cp_milk$Fac[[2]][,2], cp_milk$Fac[[3]][,2])

print("Positive loadings:")

## [1] "Positive loadings:"

print(cor(Xhat[,topIndices[1],1], processedMilk$mode1$BMI)) # no flip

## [1] 0.1705408

print("Negative loadings:")

## [1] "Negative loadings:"

print(cor(Xhat[,bottomIndices[1],1], processedMilk$mode1$BMI))

## [1] -0.1705408

a = processedMilk$mode1 %>%
  mutate(Component_2 = cp_milk$Fac[[1]][,2]) %>%
  ggplot(aes(x=BMI,y=Component_2)) +
  geom_point() +
  stat_cor() +
  xlab("ppBMI") +
  ylab("Loading") +
  theme(text=element_text(size=16))

df = processedMilk$mode2 %>%
  mutate(Component_2 = cp_milk$Fac[[2]][,2]) %>%
  arrange(Component_2) %>%
  mutate(index=1:nrow(.)) %>%
  mutate(Genus = gsub("g__", "", V7), Species = gsub("s__", "", V8)) %>% 
  mutate(Species = str_split_fixed(Species, "_", 2)[,2]) %>% 
  mutate(dplyr::across(.cols = Species, .fns = ~ dplyr::if_else(stringr::str_detect(.x, "sp."), "", .x))) %>%
  mutate(dplyr::across(.cols = Species, .fns = ~ dplyr::if_else(stringr::str_detect(.x, "organism"), "", .x))) %>%
  mutate(dplyr::across(.cols = Species, .fns = ~ dplyr::if_else(stringr::str_detect(.x, "bacterium"), "", .x))) %>%
  filter(index %in% c(1:10, 106:115))

df[df$Genus == "" & df$Species == "", "Species"] = "Unknown"
df[df$Species == "", "Species"] = "sp."
df$name = paste0(df$Genus, " ", df$Species)
  
b = df %>% ggplot(aes(x=Component_2,y=as.factor(index),fill=as.factor(V3))) +
  geom_bar(stat="identity", col="black") +
  xlab("Loading") +
  ylab("") +
  scale_y_discrete(labels=df$name) +
  scale_fill_manual(name="Phylum", values=hue_pal()(5)[-c(2,5)], labels=c("Actinobacteriota", "Firmicutes", "Proteobacteria")) +
  theme(text=element_text(size=16))

c = cp_milk$Fac[[3]][,2] %>%
  as_tibble() %>% 
  mutate(value=value) %>% 
  ggplot(aes(x=c(0,30,60,90),y=value)) +
  geom_line() +
  geom_point() +
  xlab("Time point [day]") +
  ylab("Loading") +
  ylim(0,1) +
  theme(text=element_text(size=16))

a

HM metabolome

CP was applied to capture the dominant source of variation in the HM metabolomics data block. The model selection procedure revealed that CORCONDIA values were too low at four components. The commented code below performs the procedure, and is not rerun here due to computational limitations.

# assessment_milkMetab = parafac4microbiome::assessModelQuality(processedMilkMetab$data, numRepetitions = 10, numCores = 10)
# 
# assessment_milkMetab$metrics$CORCONDIA %>% as_tibble() %>% mutate(index=1:nrow(.)) %>% pivot_longer(-index) %>% ggplot(aes(x=as.factor(name),y=value)) + geom_boxplot() + xlab("Number of components") + ylab("CORCONDIA") + theme(text=element_text(size=16))

# stability_milkMetab = parafac4microbiome::assessModelStability(processedMilkMetab, maxNumComponents=4, numFolds=10, colourCols=c("", "", ""), legendTitles=c("BMI group", "Class", ""), xLabels = xLabels, legendColNums=legendColNums, arrangeModes=arrangeModes, numCores=parallel::detectCores())
# stability_milkMetab$modelPlots[[3]]
# stability_milkMetab$modelPlots[[4]]

# cp_milkMetab = parafac4microbiome::parafac(processedMilkMetab$data, nfac=3, nstart = 100)
cp_milkMetab = readRDS("./MAINHEALTH/cp_milkMetab.RDS")

cp_milkMetab$varExp

## [1] 21.55482

Therefore, a three-component CP model was selected, explaining 21.6% of the variation. MLR analysis revealed that the first and third components were uninterpretable, while the second component captured variation exclusively and strongly associated with maternal secretor status.

testMetadata(cp_milkMetab, 1, processedMilkMetab)

##        Term  Estimate                                          CI    P_value
## 1       BMI -2.583661 -0.00535718628987143 – 0.000189863674592802 0.06760372
## 2 C.section -9.956381    -0.0605921878622612 – 0.0406794268355845 0.69782750
## 3  Secretor  9.252236    -0.0261442697993304 – 0.0446487419625543 0.60584696
## 4     Lewis 92.519158     -0.0281341347345387 – 0.213172450952438 0.13163311
## 5    whz.6m 10.497357   -0.00275014531882957 – 0.0237448598441314 0.11934771
##    P_adjust
## 1 0.2193885
## 2 0.6978275
## 3 0.6978275
## 4 0.2193885
## 5 0.2193885

testMetadata(cp_milkMetab, 2, processedMilkMetab)

##        Term    Estimate                                        CI      P_value
## 1       BMI   0.1116855 -0.00080493346242961 – 0.0010283045137446 8.098366e-01
## 2 C.section  11.0721065 -0.00566246154228743 – 0.0278066745141615 1.927839e-01
## 3  Secretor 177.1666463     0.165468496949462 – 0.188864795602086 6.443800e-59
## 4     Lewis -37.4429441 -0.0773175080477337 – 0.00243161979792261 6.545970e-02
## 5    whz.6m   1.6165731 -0.0027615783945977 – 0.00599472449699943 4.662904e-01
##       P_adjust
## 1 8.098366e-01
## 2 3.213065e-01
## 3 3.221900e-58
## 4 1.636492e-01
## 5 5.828630e-01

testMetadata(cp_milkMetab, 3, processedMilkMetab)

##        Term   Estimate                                          CI   P_value
## 1       BMI  -1.716400 -0.00415552494908338 – 0.000722724998462952 0.1661829
## 2 C.section -22.466123    -0.0669968421278869 – 0.0220645963398689 0.3199733
## 3  Secretor  -6.672137    -0.0378009361212165 – 0.0244566617238421 0.6721456
## 4     Lewis -22.466704     -0.128572999236914 – 0.0836395906101599 0.6758950
## 5    whz.6m   3.066525   -0.00858374497479842 – 0.0147167946587828 0.6033337
##   P_adjust
## 1 0.675895
## 2 0.675895
## 3 0.675895
## 4 0.675895
## 5 0.675895

topIndices = processedMilkMetab$mode2 %>% mutate(index=1:nrow(.), Comp = cp_milkMetab$Fac[[2]][,2]) %>% arrange(desc(Comp)) %>% head() %>% select(index) %>% pull()
bottomIndices = processedMilkMetab$mode2 %>% mutate(index=1:nrow(.), Comp = cp_milkMetab$Fac[[2]][,2]) %>% arrange(desc(Comp)) %>% tail() %>% select(index) %>% pull()

Xhat = parafac4microbiome::reinflateTensor(cp_milkMetab$Fac[[1]][,2], cp_milkMetab$Fac[[2]][,2], cp_milkMetab$Fac[[3]][,2])

print("Positive loadings:")

## [1] "Positive loadings:"

print(cor(Xhat[,topIndices[1],1], processedMilkMetab$mode1$Secretor)) # flip

## [1] -0.9257706

print("Negative loadings:")

## [1] "Negative loadings:"

print(cor(Xhat[,bottomIndices[1],1], processedMilkMetab$mode1$Secretor))

## [1] 0.9257706

a = processedMilkMetab$mode1 %>%
  mutate(Component_2 = cp_milkMetab$Fac[[1]][,2]) %>%
  ggplot(aes(x=as.factor(Secretor),y=Component_2)) +
  geom_boxplot() +
  stat_compare_means() +
  xlab("Secretor status") +
  ylab("Loading") +
  scale_x_discrete(labels=c("Se-", "Se+")) +
  theme(text=element_text(size=16))

temp = processedMilkMetab$mode2 %>%
  mutate(Component_2 = -1*cp_milkMetab$Fac[[2]][,2]) %>%
  arrange(Component_2) %>%
  mutate(index=1:nrow(.)) %>%
  filter(index %in% c(1:10, 61:70))

b=temp %>%
  ggplot(aes(x=Component_2,y=as.factor(index),fill=as.factor(Class),pattern=as.factor(Class))) +
  geom_bar_pattern(stat="identity",
                   colour="black",
                   pattern="stripe",
                   pattern_fill="black",
                   pattern_density=0.2,
                   pattern_spacing=0.05,
                   pattern_angle=45,
                   pattern_size=0.2) +
  scale_y_discrete(label=temp$Metabolite) +
  xlab("Loading") +
  ylab("") +
  scale_fill_manual(name="Class", values=hue_pal()(7)[-c(4,5)], labels=c("Amino acids and derivatives", "Energy related", "Fatty acids and derivatives", "Oligosaccharides", "Sugars")) +
  theme(text=element_text(size=16))

c = cp_milkMetab$Fac[[3]][,2] %>%
  as_tibble() %>% 
  mutate(value=value*-1) %>% 
  ggplot(aes(x=c(0,30,60,90),y=value)) +
  geom_line() +
  geom_point() +
  xlab("Time point [day]") +
  ylab("Loading") +
  ylim(0,1) +
  theme(text=element_text(size=16))

a

2025-08-31

Introduction

Preamble

Faecal microbiome

HM microbiome

HM metabolome