Skip to contents

Preamble

## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Attaching package: 'CMTFtoolbox'
## The following object is masked from 'package:NPLStoolbox':
## 
##     npred
## The following objects are masked from 'package:parafac4microbiome':
## 
##     fac_to_vect, reinflateFac, reinflateTensor, vect_to_fac
testMetadata = function(model, comp, metadata){
  transformedSubjectLoadings = model$Fac[[1]][,comp]
  transformedSubjectLoadings = transformedSubjectLoadings / norm(transformedSubjectLoadings, "2")
  
  result = lm(transformedSubjectLoadings ~ BMI + C.section + Secretor + Lewis + whz.6m, data=metadata$mode1)
  
  # Extract coefficients and confidence intervals
  coef_estimates <- summary(result)$coefficients
  conf_intervals <- confint(result)
  
  # Remove intercept
  coef_estimates <- coef_estimates[rownames(coef_estimates) != "(Intercept)", ]
  conf_intervals <- conf_intervals[rownames(conf_intervals) != "(Intercept)", ]
  
  # Combine into a clean data frame
  summary_table <- data.frame(
    Term     = rownames(coef_estimates),
    Estimate = coef_estimates[, "Estimate"] * 1e3,
    CI       = paste0(
                 conf_intervals[, 1], " – ",
                 conf_intervals[, 2]
               ),
    P_value  = coef_estimates[, "Pr(>|t|)"],
    P_adjust = p.adjust(coef_estimates[, "Pr(>|t|)"], "BH"),
    row.names = NULL
  )
  
  return(summary_table)
}
# Old approach
# Needed colours: BMI groups (3) and phyla levels (6)
# colours = hue_pal()(9)
# #show_col(colours)
# BMI_cols = colours[c(1,4,7)]
# phylum_cols = colours[-c(1,4,7)]

# Rasmus's suggestion
colours = RColorBrewer:: brewer.pal(8, "Dark2")
BMI_cols = c("darkgreen","darkgoldenrod","darkred") #colours[1:3]
WHZ_cols = c("darkgreen", "darkgoldenrod", "dodgerblue4")
phylum_cols = colours
processedFaeces = parafac4microbiome::processDataCube(NPLStoolbox::Jakobsen2025$faeces, sparsityThreshold = 0.75, centerMode=1, scaleMode=2)
processedMilk = parafac4microbiome::processDataCube(NPLStoolbox::Jakobsen2025$milkMicrobiome, sparsityThreshold=0.85, centerMode=1, scaleMode=2)
processedMilkMetab = parafac4microbiome::processDataCube(NPLStoolbox::Jakobsen2025$milkMetabolomics, CLR=FALSE, centerMode=1, scaleMode=2)

Model selection

# assessment_faeces = parafac4microbiome::assessModelQuality(processedFaeces$data, numRepetitions = 10, numCores = 10)
# assessment_milk = parafac4microbiome::assessModelQuality(processedMilk$data, numRepetitions = 10, numCores = 10)
# assessment_milkMetab = parafac4microbiome::assessModelQuality(processedMilkMetab$data, numRepetitions = 10, numCores = 10)
# 
# saveRDS(assessment_faeces, "./MAINHEALTH/assessment_faeces.RDS")
# saveRDS(assessment_milk, "./MAINHEALTH/assessment_milk.RDS")
# saveRDS(assessment_milkMetab, "./MAINHEALTH/assessment_milkMetab.RDS")
# 
# assessment_faeces = readRDS("./assessment_faeces.RDS")
# assessment_milk = readRDS("./assessment_milk.RDS")
# assessment_milkMetab = readRDS("./assessment_milkMetab.RDS")
# 
# assessment_faeces$plots$overview # 2 or 3 components
# assessment_faeces$plots$TCC[[3]] # 3 components seems safe
# 
# assessment_milk$plots$overview # likely 3 components
# assessment_milk$plots$TCC[[3]] # 3 components seems safe
# 
# assessment_milkMetab$plots$overview # 2 or 3 components, likely 3
# assessment_milkMetab$plots$TCC[[3]] # 3 components seems safe
# 
# a = assessment_faeces$metrics$CORCONDIA %>% as_tibble() %>% mutate(index=1:nrow(.)) %>% pivot_longer(-index) %>% ggplot(aes(x=as.factor(name),y=value)) + geom_boxplot() + xlab("Number of components") + ylab("CORCONDIA") + theme(text=element_text(size=16))
# b = assessment_milk$metrics$CORCONDIA %>% as_tibble() %>% mutate(index=1:nrow(.)) %>% pivot_longer(-index) %>% ggplot(aes(x=as.factor(name),y=value)) + geom_boxplot() + xlab("Number of components") + ylab("CORCONDIA") + theme(text=element_text(size=16))
# c = assessment_milkMetab$metrics$CORCONDIA %>% as_tibble() %>% mutate(index=1:nrow(.)) %>% pivot_longer(-index) %>% ggplot(aes(x=as.factor(name),y=value)) + geom_boxplot() + xlab("Number of components") + ylab("CORCONDIA") + theme(text=element_text(size=16))
# ggarrange(a,b,c)
# colourCols = c("", "", "")
# legendTitles = c("BMI group", "Phylum", "")
# xLabels = c("Subject index", "Feature index", "Time index")
# legendColNums = c(3,5,0)
# arrangeModes = c(TRUE, TRUE, FALSE)
# continuousModes = c(FALSE,FALSE,TRUE)
# 
# stability_faeces = parafac4microbiome::assessModelStability(processedFaeces, maxNumComponents=3, numFolds=20, colourCols=colourCols, legendTitles=legendTitles, xLabels = xLabels, legendColNums=legendColNums, arrangeModes=arrangeModes, numCores=parallel::detectCores())
# stability_faeces$modelPlots[[2]]
# stability_faeces$modelPlots[[3]]
# 
# stability_milk = parafac4microbiome::assessModelStability(processedMilk, maxNumComponents=3, numFolds=10, colourCols=colourCols, legendTitles=legendTitles, xLabels = xLabels, legendColNums=legendColNums, arrangeModes=arrangeModes, numCores=parallel::detectCores())
# stability_milk$modelPlots[[2]]
# stability_milk$modelPlots[[3]]
# 
# stability_milkMetab = parafac4microbiome::assessModelStability(processedMilkMetab, maxNumComponents=4, numFolds=10, colourCols=c("", "", ""), legendTitles=c("BMI group", "Class", ""), xLabels = xLabels, legendColNums=legendColNums, arrangeModes=arrangeModes, numCores=parallel::detectCores())
# stability_milkMetab$modelPlots[[3]]
# stability_milkMetab$modelPlots[[4]]

# 
# stability_faeces$modelPlots[[3]] # 3 components seems ok
# stability_milk$modelPlots[[3]] # 3 components very unstable
# stability_milk$modelPlots[[2]] # 2 components still a bit unstable
# stability_milkMetab$modelPlots[[3]] # 3 components ok
# cp_faeces = parafac4microbiome::parafac(processedFaeces$data, nfac=3, nstart = 100)
# cp_milk = parafac4microbiome::parafac(processedMilk$data, nfac=2, nstart = 100)
# cp_milkMetab = parafac4microbiome::parafac(processedMilkMetab$data, nfac=3, nstart = 100)
# 
# saveRDS(cp_faeces, "./MAINHEALTH/cp_faeces.RDS")
# saveRDS(cp_milk, "./MAINHEALTH/cp_milk.RDS")
# saveRDS(cp_milkMetab, "./MAINHEALTH/cp_milkMetab.RDS")

cp_faeces = readRDS("./MAINHEALTH/cp_faeces.RDS")
cp_milk = readRDS("./MAINHEALTH/cp_milk.RDS")
cp_milkMetab = readRDS("./MAINHEALTH/cp_milkMetab.RDS")

Faecal microbiome

testMetadata(cp_faeces, 1, processedFaeces)
##        Term   Estimate                                         CI    P_value
## 1       BMI   1.557682 -0.00109277376368493 – 0.00420813678843941 0.24698916
## 2 C.section -17.096372   -0.0655708277484862 – 0.0313780847296313 0.48646498
## 3  Secretor  11.375601   -0.0221113148908448 – 0.0448625169392564 0.50262459
## 4     Lewis -86.360039   -0.181248173947069 – 0.00852809560447294 0.07407419
## 5    whz.6m  -3.321885   -0.0160020917474101 – 0.0093583227214069 0.60504067
##    P_adjust
## 1 0.6050407
## 2 0.6050407
## 3 0.6050407
## 4 0.3703709
## 5 0.6050407
testMetadata(cp_faeces, 2, processedFaeces)
##        Term   Estimate                                          CI    P_value
## 1       BMI   2.722216 -4.56020479994157e-06 – 0.00544899180328022 0.05037851
## 2 C.section  46.004176     -0.003866118532671 – 0.0958744696412498 0.07028311
## 3  Secretor  -7.108612      -0.041559794510577 – 0.027342570748067 0.68370085
## 4     Lewis -60.366832     -0.157987302185625 – 0.0372536372194021 0.22330820
## 5    whz.6m   1.450616    -0.0115947217747149 – 0.0144959541369427 0.82617195
##    P_adjust
## 1 0.1757078
## 2 0.1757078
## 3 0.8261720
## 4 0.3721803
## 5 0.8261720

HM microbiome

testMetadata(cp_milk, 1, processedMilk)
##        Term    Estimate                                         CI   P_value
## 1       BMI  -0.7971158 -0.00356908227805278 – 0.00197485067980183 0.5702715
## 2 C.section -27.2768492   -0.0779905182386613 – 0.0234368199177587 0.2891367
## 3  Secretor   4.8853309    -0.0301641355434441 – 0.039934797247701 0.7830997
## 4     Lewis  28.0599419     -0.071151776413022 – 0.127271660144112 0.5766273
## 5    whz.6m   2.3991703    -0.0108741423856309 – 0.015672483059881 0.7211329
##    P_adjust
## 1 0.7830997
## 2 0.7830997
## 3 0.7830997
## 4 0.7830997
## 5 0.7830997
testMetadata(cp_milk, 2, processedMilk)
##        Term   Estimate                                         CI    P_value
## 1       BMI   2.942392 0.000305899155017146 – 0.00557888463379954 0.02901701
## 2 C.section  25.569124   -0.0226660263261727 – 0.0738042746886722 0.29612537
## 3  Secretor   6.165320   -0.0271711809286324 – 0.0395018213417433 0.71494998
## 4     Lewis -74.787457    -0.169150421245261 – 0.0195755080850167 0.11927081
## 5    whz.6m  -3.204212  -0.0158288212471368 – 0.00942039651018406 0.61630886
##    P_adjust
## 1 0.1450850
## 2 0.4935423
## 3 0.7149500
## 4 0.2981770
## 5 0.7149500
topIndices = processedMilk$mode2 %>% mutate(index=1:nrow(.), Comp = cp_milk$Fac[[2]][,2]) %>% arrange(desc(Comp)) %>% head() %>% select(index) %>% pull()
bottomIndices = processedMilk$mode2 %>% mutate(index=1:nrow(.), Comp = cp_milk$Fac[[2]][,2]) %>% arrange(desc(Comp)) %>% tail() %>% select(index) %>% pull()

Xhat = parafac4microbiome::reinflateTensor(cp_milk$Fac[[1]][,2], cp_milk$Fac[[2]][,2], cp_milk$Fac[[3]][,2])

print("Positive loadings:")
## [1] "Positive loadings:"
print(cor(Xhat[,topIndices[1],1], processedMilk$mode1$BMI)) # no flip
## [1] 0.1705408
print("Negative loadings:")
## [1] "Negative loadings:"
print(cor(Xhat[,bottomIndices[1],1], processedMilk$mode1$BMI))
## [1] -0.1705408
a = processedMilk$mode1 %>%
  mutate(Component_2 = cp_milk$Fac[[1]][,2]) %>%
  ggplot(aes(x=BMI,y=Component_2)) +
  geom_point() +
  stat_cor() +
  xlab("ppBMI") +
  ylab("Loading") +
  theme(text=element_text(size=16))

df = processedMilk$mode2 %>%
  mutate(Component_2 = cp_milk$Fac[[2]][,2]) %>%
  arrange(Component_2) %>%
  mutate(index=1:nrow(.)) %>%
  mutate(Genus = gsub("g__", "", V7), Species = gsub("s__", "", V8)) %>% 
  mutate(Species = str_split_fixed(Species, "_", 2)[,2]) %>% 
  mutate(dplyr::across(.cols = Species, .fns = ~ dplyr::if_else(stringr::str_detect(.x, "sp."), "", .x))) %>%
  mutate(dplyr::across(.cols = Species, .fns = ~ dplyr::if_else(stringr::str_detect(.x, "organism"), "", .x))) %>%
  mutate(dplyr::across(.cols = Species, .fns = ~ dplyr::if_else(stringr::str_detect(.x, "bacterium"), "", .x))) %>%
  filter(index %in% c(1:10, 106:115))

df[df$Genus == "" & df$Species == "", "Species"] = "Unknown"
df[df$Species == "", "Species"] = "sp."
df$name = paste0(df$Genus, " ", df$Species)
  
b = df %>% ggplot(aes(x=Component_2,y=as.factor(index),fill=as.factor(V3))) +
  geom_bar(stat="identity", col="black") +
  xlab("Loading") +
  ylab("") +
  scale_y_discrete(labels=df$name) +
  scale_fill_manual(name="Phylum", values=hue_pal()(5)[-c(2,5)], labels=c("Actinobacteriota", "Firmicutes", "Proteobacteria")) +
  theme(text=element_text(size=16))

c = cp_milk$Fac[[3]][,2] %>%
  as_tibble() %>% 
  mutate(value=value) %>% 
  ggplot(aes(x=c(0,30,60,90),y=value)) +
  geom_line() +
  geom_point() +
  xlab("Time point [day]") +
  ylab("Loading") +
  ylim(0,1) +
  theme(text=element_text(size=16))

a

b

c

HM metabolome

testMetadata(cp_milkMetab, 1, processedMilkMetab)
##        Term  Estimate                                          CI    P_value
## 1       BMI -2.583661 -0.00535718628987143 – 0.000189863674592802 0.06760372
## 2 C.section -9.956381    -0.0605921878622612 – 0.0406794268355845 0.69782750
## 3  Secretor  9.252236    -0.0261442697993304 – 0.0446487419625543 0.60584696
## 4     Lewis 92.519158     -0.0281341347345387 – 0.213172450952438 0.13163311
## 5    whz.6m 10.497357   -0.00275014531882957 – 0.0237448598441314 0.11934771
##    P_adjust
## 1 0.2193885
## 2 0.6978275
## 3 0.6978275
## 4 0.2193885
## 5 0.2193885
testMetadata(cp_milkMetab, 2, processedMilkMetab)
##        Term    Estimate                                        CI      P_value
## 1       BMI   0.1116855 -0.00080493346242961 – 0.0010283045137446 8.098366e-01
## 2 C.section  11.0721065 -0.00566246154228743 – 0.0278066745141615 1.927839e-01
## 3  Secretor 177.1666463     0.165468496949462 – 0.188864795602086 6.443800e-59
## 4     Lewis -37.4429441 -0.0773175080477337 – 0.00243161979792261 6.545970e-02
## 5    whz.6m   1.6165731 -0.0027615783945977 – 0.00599472449699943 4.662904e-01
##       P_adjust
## 1 8.098366e-01
## 2 3.213065e-01
## 3 3.221900e-58
## 4 1.636492e-01
## 5 5.828630e-01
testMetadata(cp_milkMetab, 3, processedMilkMetab)
##        Term   Estimate                                          CI   P_value
## 1       BMI  -1.716400 -0.00415552494908338 – 0.000722724998462952 0.1661829
## 2 C.section -22.466123    -0.0669968421278869 – 0.0220645963398689 0.3199733
## 3  Secretor  -6.672137    -0.0378009361212165 – 0.0244566617238421 0.6721456
## 4     Lewis -22.466704     -0.128572999236914 – 0.0836395906101599 0.6758950
## 5    whz.6m   3.066525   -0.00858374497479842 – 0.0147167946587828 0.6033337
##   P_adjust
## 1 0.675895
## 2 0.675895
## 3 0.675895
## 4 0.675895
## 5 0.675895
topIndices = processedMilkMetab$mode2 %>% mutate(index=1:nrow(.), Comp = cp_milkMetab$Fac[[2]][,2]) %>% arrange(desc(Comp)) %>% head() %>% select(index) %>% pull()
bottomIndices = processedMilkMetab$mode2 %>% mutate(index=1:nrow(.), Comp = cp_milkMetab$Fac[[2]][,2]) %>% arrange(desc(Comp)) %>% tail() %>% select(index) %>% pull()

Xhat = parafac4microbiome::reinflateTensor(cp_milkMetab$Fac[[1]][,2], cp_milkMetab$Fac[[2]][,2], cp_milkMetab$Fac[[3]][,2])

print("Positive loadings:")
## [1] "Positive loadings:"
print(cor(Xhat[,topIndices[1],1], processedMilkMetab$mode1$Secretor)) # flip
## [1] -0.9257706
print("Negative loadings:")
## [1] "Negative loadings:"
print(cor(Xhat[,bottomIndices[1],1], processedMilkMetab$mode1$Secretor))
## [1] 0.9257706
a = processedMilkMetab$mode1 %>%
  mutate(Component_2 = cp_milkMetab$Fac[[1]][,2]) %>%
  ggplot(aes(x=as.factor(Secretor),y=Component_2)) +
  geom_boxplot() +
  stat_compare_means() +
  xlab("Secretor status") +
  ylab("Loading") +
  scale_x_discrete(labels=c("Se-", "Se+")) +
  theme(text=element_text(size=16))

temp = processedMilkMetab$mode2 %>%
  mutate(Component_2 = -1*cp_milkMetab$Fac[[2]][,2]) %>%
  arrange(Component_2) %>%
  mutate(index=1:nrow(.)) %>%
  filter(index %in% c(1:10, 61:70))

b=temp %>%
  ggplot(aes(x=Component_2,y=as.factor(index),fill=as.factor(Class),pattern=as.factor(Class))) +
  geom_bar_pattern(stat="identity",
                   colour="black",
                   pattern="stripe",
                   pattern_fill="black",
                   pattern_density=0.2,
                   pattern_spacing=0.05,
                   pattern_angle=45,
                   pattern_size=0.2) +
  scale_y_discrete(label=temp$Metabolite) +
  xlab("Loading") +
  ylab("") +
  scale_fill_manual(name="Class", values=hue_pal()(7)[-c(4,5)], labels=c("Amino acids and derivatives", "Energy related", "Fatty acids and derivatives", "Oligosaccharides", "Sugars")) +
  theme(text=element_text(size=16))

c = cp_milkMetab$Fac[[3]][,2] %>%
  as_tibble() %>% 
  mutate(value=value*-1) %>% 
  ggplot(aes(x=c(0,30,60,90),y=value)) +
  geom_line() +
  geom_point() +
  xlab("Time point [day]") +
  ylab("Loading") +
  ylim(0,1) +
  theme(text=element_text(size=16))

a

b

c