Skip to contents

Processing

# Faeces
newFaeces = NPLStoolbox::Jakobsen2025$faeces

mask = newFaeces$mode1$subject %in% (NPLStoolbox::Jakobsen2025$faeces$mode1 %>% filter(!is.na(BMI)) %>% select(subject) %>% pull())
newFaeces$data = newFaeces$data[mask,,]
newFaeces$mode1 = newFaeces$mode1[mask,]
processedFaeces = parafac4microbiome::processDataCube(newFaeces, sparsityThreshold = 0.75, centerMode=1, scaleMode=2)

# Milk
newMilk = NPLStoolbox::Jakobsen2025$milkMicrobiome

mask = newMilk$mode1$subject %in% (NPLStoolbox::Jakobsen2025$milkMicrobiome$mode1 %>% filter(!is.na(BMI)) %>% select(subject) %>% pull())
newMilk$data = newMilk$data[mask,,]
newMilk$mode1 = newMilk$mode1[mask,]
processedMilk = parafac4microbiome::processDataCube(newMilk, sparsityThreshold=0.85, centerMode=1, scaleMode=2)

# Milk metab
newMilkMetab = NPLStoolbox::Jakobsen2025$milkMetabolomics

mask = newMilkMetab$mode1$subject %in% (NPLStoolbox::Jakobsen2025$milkMetabolomics$mode1 %>% filter(!is.na(BMI)) %>% select(subject) %>% pull())
newMilkMetab$data = newMilkMetab$data[mask,,]
newMilkMetab$mode1 = newMilkMetab$mode1[mask,]
processedMilkMetab = parafac4microbiome::processDataCube(newMilkMetab, CLR=FALSE, centerMode=1, scaleMode=2)
Y = processedFaeces$mode1$BMI
Ycnt = Y - mean(Y)
Ynorm_faecal = Ycnt / norm(Ycnt, "2")

Y = processedMilk$mode1$BMI
Ycnt = Y - mean(Y)
Ynorm_milk = Ycnt / norm(Ycnt, "2")

Y = processedMilkMetab$mode1$BMI
Ycnt = Y - mean(Y)
Ynorm_milkMetab = Ycnt / norm(Ycnt, "2")
CV_faecal = NPLStoolbox::ncrossreg(processedFaeces$data, Ynorm_faecal, maxNumComponents = 5, cvFolds=10)
CV_milk = NPLStoolbox::ncrossreg(processedMilk$data, Ynorm_milk, maxNumComponents = 5, cvFolds=10)
CV_milkMetab = NPLStoolbox::ncrossreg(processedMilkMetab$data, Ynorm_milkMetab, maxNumComponents = 5, cvFolds=10)

a=CV_faecal$varExp %>% pivot_longer(-numComponents) %>% ggplot(aes(x=as.factor(numComponents),y=value,col=as.factor(name),group=as.factor(name))) + geom_line() + geom_point() + xlab("Number of components") + ylab("Variance explained (%)") + theme(legend.position="none")
b=CV_faecal$RMSE %>% ggplot(aes(x=numComponents,y=RMSE)) + geom_line() + geom_point() + xlab("Number of components") + ylab("RMSECV")
ggarrange(a,b) # 1 comp


a=CV_milk$varExp %>% pivot_longer(-numComponents) %>% ggplot(aes(x=as.factor(numComponents),y=value,col=as.factor(name),group=as.factor(name))) + geom_line() + geom_point() + xlab("Number of components") + ylab("Variance explained (%)") + theme(legend.position="none")
b=CV_milk$RMSE %>% ggplot(aes(x=numComponents,y=RMSE)) + geom_line() + geom_point() + xlab("Number of components") + ylab("RMSECV")
ggarrange(a,b) # 1 or 2 comps


a=CV_milkMetab$varExp %>% pivot_longer(-numComponents) %>% ggplot(aes(x=as.factor(numComponents),y=value,col=as.factor(name),group=as.factor(name))) + geom_line() + geom_point() + xlab("Number of components") + ylab("Variance explained (%)") + theme(legend.position="none")
b=CV_milkMetab$RMSE %>% ggplot(aes(x=numComponents,y=RMSE)) + geom_line() + geom_point() + xlab("Number of components") + ylab("RMSECV")
ggarrange(a,b) # 1 comp

NPLS_faecal = NPLStoolbox::triPLS1(processedFaeces$data, Ynorm_faecal, 1)
NPLS_milk = NPLStoolbox::triPLS1(processedMilk$data, Ynorm_milk, 1)
NPLS_milkMetab = NPLStoolbox::triPLS1(processedMilkMetab$data, Ynorm_milkMetab, 2)
testMetadata = function(model, comp, metadata){
  transformedSubjectLoadings = transformPARAFACloadings(model$Fac, 1)[,comp]
  transformedSubjectLoadings = transformedSubjectLoadings / norm(transformedSubjectLoadings, "2")
  
  result = lm(transformedSubjectLoadings ~ BMI + C.section + Secretor + Lewis + whz.6m, data=metadata$mode1)
  
  # Extract coefficients and confidence intervals
  coef_estimates <- summary(result)$coefficients
  conf_intervals <- confint(result)
  
  # Remove intercept
  coef_estimates <- coef_estimates[rownames(coef_estimates) != "(Intercept)", ]
  conf_intervals <- conf_intervals[rownames(conf_intervals) != "(Intercept)", ]
  
  # Combine into a clean data frame
  summary_table <- data.frame(
    Term     = rownames(coef_estimates),
    Estimate = coef_estimates[, "Estimate"] * 1e3,
    CI       = paste0(
                 conf_intervals[, 1], " – ",
                 conf_intervals[, 2]
               ),
    P_value  = coef_estimates[, "Pr(>|t|)"],
    P_adjust = p.adjust(coef_estimates[, "Pr(>|t|)"], "BH"),
    row.names = NULL
  )
  
  return(summary_table)
}

testMetadata(NPLS_faecal, 1, processedFaeces)
#>        Term   Estimate                                        CI      P_value
#> 1       BMI   5.898149 0.00344491075499029 – 0.00835138722475675 5.296571e-06
#> 2 C.section  26.215456  -0.0186520767670303 – 0.0710829891553144 2.497335e-01
#> 3  Secretor  18.704297  -0.0122908996607013 – 0.0496994936374461 2.346150e-01
#> 4     Lewis -67.425763    -0.155253390661857 – 0.020401865528336 1.311911e-01
#> 5    whz.6m   1.488789  -0.0102479002724083 – 0.0132254776748785 8.021876e-01
#>       P_adjust
#> 1 2.648286e-05
#> 2 3.121669e-01
#> 3 3.121669e-01
#> 4 3.121669e-01
#> 5 8.021876e-01
testMetadata(NPLS_milk, 1, processedMilk)
#>        Term    Estimate                                       CI      P_value
#> 1       BMI   5.2632203 0.0028082806290875 – 0.00771815987488316 4.271728e-05
#> 2 C.section  22.5298730 -0.0223837277098485 – 0.0674434737293296 3.227116e-01
#> 3  Secretor  10.1431497  -0.0208977465387988 – 0.041184045914344 5.189801e-01
#> 4     Lewis -25.4005435  -0.113265523144618 – 0.0624644361527853 5.682343e-01
#> 5    whz.6m  -0.5937088  -0.012348966843658 – 0.0111615493028556 9.205334e-01
#>       P_adjust
#> 1 0.0002135864
#> 2 0.7102929209
#> 3 0.7102929209
#> 4 0.7102929209
#> 5 0.9205334443
testMetadata(NPLS_milkMetab, 1, processedMilkMetab)
#>        Term   Estimate                                         CI      P_value
#> 1       BMI   8.670681   0.00664584079115655 – 0.0106955221180947 5.530018e-14
#> 2 C.section  11.074774   -0.0258924224871216 – 0.0480419695984113 5.543102e-01
#> 3  Secretor  19.681300  -0.00616028587042244 – 0.0455228862023762 1.342499e-01
#> 4     Lewis -78.258205   -0.166342392359317 – 0.00982598189078401 8.113566e-02
#> 5    whz.6m -12.584073 -0.0222555492384812 – -0.00291259607365677 1.118449e-02
#>       P_adjust
#> 1 2.765009e-13
#> 2 5.543102e-01
#> 3 1.678123e-01
#> 4 1.352261e-01
#> 5 2.796123e-02
testMetadata(NPLS_milkMetab, 2, processedMilkMetab)
#>        Term     Estimate                                       CI      P_value
#> 1       BMI   -0.1981968 -0.002175283642196 – 0.00177888998370945 8.430536e-01
#> 2 C.section  -12.7552845  -0.048850646024158 – 0.0233400770550342 4.856154e-01
#> 3  Secretor -111.0853182  -0.136317456123891 – -0.085853180193343 1.501529e-14
#> 4     Lewis   42.9906785  -0.0430161307912073 – 0.128997487797659 3.244425e-01
#> 5    whz.6m    7.0106029 -0.00243278155702489 – 0.016453987320985 1.442727e-01
#>       P_adjust
#> 1 8.430536e-01
#> 2 6.070193e-01
#> 3 7.507646e-14
#> 4 5.407375e-01
#> 5 3.606817e-01
colourCols = c("BMI.group", "V3", "")
legendTitles = c("BMI group", "Phylum", "")
xLabels = c("Subject index", "Feature index", "Time index")
legendColNums = c(3,5,0)
arrangeModes = c(TRUE, TRUE, FALSE)
continuousModes = c(FALSE,FALSE,TRUE)

parafac4microbiome::plotPARAFACmodel(NPLS_faecal$Fac, processedFaeces, 1, colourCols, legendTitles, xLabels, legendColNums, arrangeModes, continuousModes,
  overallTitle = "Infant faecal microbiome")

parafac4microbiome::plotPARAFACmodel(NPLS_milk$Fac, processedMilk, 1, colourCols, legendTitles, xLabels, legendColNums, arrangeModes, continuousModes,
  overallTitle = "HM microbiome")

parafac4microbiome::plotPARAFACmodel(NPLS_milkMetab$Fac, processedMilkMetab, 2, colourCols, legendTitles, xLabels, legendColNums, arrangeModes, continuousModes, overallTitle = "HM metabolome")