GOHTRANS: gender-affirming hormone therapy

Introduction

This article presents the CP results of the GOHTRANS dataset discussed in Chapter 6 of my dissertation.

Preamble

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(parafac4microbiome)
library(NPLStoolbox)
library(CMTFtoolbox)

## 
## Attaching package: 'CMTFtoolbox'

## The following object is masked from 'package:NPLStoolbox':
## 
##     npred

## The following objects are masked from 'package:parafac4microbiome':
## 
##     fac_to_vect, reinflateFac, reinflateTensor, vect_to_fac

library(ggplot2)
library(ggpubr)
library(readr)

ph_BOMP = read_delim("./GOHTRANS/GOH-TRANS_export_20240205.csv",
                     delim = ";", escape_double = FALSE, trim_ws = TRUE) %>% as_tibble()

## Rows: 42 Columns: 347
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (104): Participant Status, Site Abbreviation, Participant Creation Date,...
## dbl (243): Participant Id, pat_genderID, Age, Informed_consent#Yes, Informed...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

df1 = ph_BOMP %>% select(`Participant Id`, starts_with("5.")) %>% mutate(subject = 1:42, numTeeth = `5.1|Number of teeth`, DMFT = `5.2|DMFT`, numBleedingSites = `5.3|Bleeding sites`, boppercent = `5.4|BOP%`, DPSI = `5.5|DPSI`, pH = `5.8|pH`) %>% select(subject, numTeeth, DMFT, numBleedingSites, boppercent, DPSI, pH)
df2 = ph_BOMP %>% select(`Participant Id`, starts_with("12.")) %>% mutate(subject = 1:42, numTeeth = `12.1|Number of teeth`, DMFT = `12.2|DMFT`, numBleedingSites = `12.3|Bleeding sites`, boppercent = `12.4|BOP%`, DPSI = `12.5|DPSI`, pH = `12.8|pH`) %>% select(subject, numTeeth, DMFT, numBleedingSites, boppercent, DPSI, pH)
df3 = ph_BOMP %>% select(`Participant Id`, starts_with("19.")) %>% mutate(subject = 1:42, numTeeth = `19.1|Number of teeth`, DMFT = `19.2|DMFT`, numBleedingSites = `19.3|Bleeding sites`, boppercent = `19.4|BOP%`, DPSI = `19.5|DPSI`, pH = `19.8|pH`) %>% select(subject, numTeeth, DMFT, numBleedingSites, boppercent, DPSI, pH)
df4 = ph_BOMP %>% select(`Participant Id`, starts_with("26.")) %>% mutate(subject = 1:42, numTeeth = `26.1|Number of teeth`, DMFT = `26.2|DMFT`, numBleedingSites = `26.3|Bleeding sites`, boppercent = `26.4|BOP%`, DPSI = `26.5|DPSI`, pH = `26.8|pH`) %>% select(subject, numTeeth, DMFT, numBleedingSites, boppercent, DPSI, pH)

otherMeta = rbind(df1, df2, df3, df4) %>% as_tibble() %>% mutate(newTimepoint = rep(c(0,3,6,12), each=42))
otherMeta = otherMeta %>% select(subject, newTimepoint, boppercent)
otherMeta[otherMeta$boppercent < 0 & !is.na(otherMeta$boppercent), "boppercent"] = NA

saliva_sampleMeta = read.csv("./GOHTRANS/sampleInfo_fixed.csv", sep=" ") %>% as_tibble() %>% select(subject, GenderID, Age, newTimepoint)

# CODS XI
df = read.csv("./GOHTRANS/CODS_XI.csv")
df = df[1:40,1:10] %>% as_tibble() %>% filter(Participant_code %in% saliva_sampleMeta$subject)

CODS = df %>% select(Participant_code, CODS_BASELINE, CODS_3_MONTHS, CODS_6_MONTHS, CODS_12_MONTHS) %>% pivot_longer(-Participant_code) %>% mutate(timepoint = NA)
CODS[CODS$name == "CODS_BASELINE", "timepoint"] = 0
CODS[CODS$name == "CODS_3_MONTHS", "timepoint"] = 3
CODS[CODS$name == "CODS_6_MONTHS", "timepoint"] = 6
CODS[CODS$name == "CODS_12_MONTHS", "timepoint"] = 12
CODS[CODS$value == -99, "value"] = NA
CODS = CODS %>% mutate(subject = Participant_code, newTimepoint = timepoint, CODS = value) %>% select(-Participant_code, -name, -value, -timepoint)

XI = df %>% select(Participant_code, XI_BASELINE, XI_3_MONTHS, XI_6_MONTHS, XI_12_MONTHS) %>% pivot_longer(-Participant_code) %>% mutate(timepoint = NA)
XI[XI$name == "XI_BASELINE", "timepoint"] = 0
XI[XI$name == "XI_3_MONTHS", "timepoint"] = 3
XI[XI$name == "XI_6_MONTHS", "timepoint"] = 6
XI[XI$name == "XI_12_MONTHS", "timepoint"] = 12
XI[XI$value == -99, "value"] = NA
XI = XI %>% mutate(subject = Participant_code, newTimepoint = timepoint, XI = value) %>% select(-Participant_code, -name, -value, -timepoint)

df = otherMeta %>% left_join(CODS) %>% left_join(XI) %>% select(subject, newTimepoint, boppercent, CODS, XI) %>% filter(newTimepoint != "NA")

## Joining with `by = join_by(subject, newTimepoint)`
## Joining with `by = join_by(subject, newTimepoint)`

meta = df %>% left_join(otherMeta) %>% left_join(saliva_sampleMeta) %>% unique()

## Joining with `by = join_by(subject, newTimepoint, boppercent)`
## Joining with `by = join_by(subject, newTimepoint)`

meta[meta < 0] <- NA
meta$GenderID = as.numeric(as.factor(meta$GenderID)) - 1

staticMeta = meta %>% filter(newTimepoint == 0) %>% select(subject, GenderID, Age)
dynamicMeta = meta %>% select(-GenderID, -Age)
# 
# newBOP = residuals(lm(boppercent ~ GenderID + Age + CODS + XI, data=meta, na.action=na.exclude))
# newCODS= residuals(lm(CODS ~ boppercent + GenderID + Age + XI, data=meta, na.action=na.exclude))
# newXI = residuals(lm(XI ~ boppercent + GenderID + Age + CODS, data=meta, na.action=na.exclude))
# 
# meta = meta %>% mutate(boppercent = newBOP, CODS = newCODS, XI = newXI)

clinical_metadata = NPLStoolbox::Cornejo2025$Clinical_measurements$mode1 %>% mutate(BOP = NPLStoolbox::Cornejo2025$Clinical_measurements$data[,1,2], CODS = NPLStoolbox::Cornejo2025$Clinical_measurements$data[,2,2], XI = NPLStoolbox::Cornejo2025$Clinical_measurements$data[,3,2]) %>% left_join(NPLStoolbox::Cornejo2025$Subject_metadata %>% mutate(subject = as.character(subject)))

## Joining with `by = join_by(subject, GenderID)`

testMetadata = function(model, comp, metadata){
  transformedSubjectLoadings = model$Fac[[1]][,comp]
  transformedSubjectLoadings = transformedSubjectLoadings / norm(transformedSubjectLoadings, "2")
  
  metadata = metadata$mode1 %>% left_join(clinical_metadata) %>% mutate(GenderID = as.numeric(as.factor(GenderID))-1)
  
  result = lm(transformedSubjectLoadings ~ GenderID + Age + BOP + CODS + XI, data=metadata)
  
  # Extract coefficients and confidence intervals
  coef_estimates <- summary(result)$coefficients
  conf_intervals <- confint(result)
  
  # Remove intercept
  coef_estimates <- coef_estimates[rownames(coef_estimates) != "(Intercept)", ]
  conf_intervals <- conf_intervals[rownames(conf_intervals) != "(Intercept)", ]
  
  # Combine into a clean data frame
  summary_table <- data.frame(
    Term     = rownames(coef_estimates),
    Estimate = coef_estimates[, "Estimate"] * 1e3,
    CI       = paste0(
                 conf_intervals[, 1], " – ",
                 conf_intervals[, 2]
               ),
    P_value  = coef_estimates[, "Pr(>|t|)"],
    P_adjust = p.adjust(coef_estimates[, "Pr(>|t|)"], "BH"),
    row.names = NULL
  )
  
  return(summary_table)
}

processedTongue = processDataCube(NPLStoolbox::Cornejo2025$Tongue_microbiome, sparsityThreshold = 0.5, considerGroups=TRUE, groupVariable="GenderID", CLR=TRUE, centerMode=1, scaleMode=2)
processedSaliva = processDataCube(NPLStoolbox::Cornejo2025$Salivary_microbiome, sparsityThreshold = 0.5, considerGroups=TRUE, groupVariable="GenderID", CLR=TRUE, centerMode=1, scaleMode=2)

processedCytokines = NPLStoolbox::Cornejo2025$Salivary_cytokines
processedCytokines$data = log(processedCytokines$data + 0.1300)
# Remove subject 1, 18, 19, and 27 due to being an outlier
processedCytokines$data = processedCytokines$data[-c(1,9,10,19),,]
processedCytokines$mode1 = processedCytokines$mode1[-c(1,9,10,19),]
processedCytokines$data = multiwayCenter(processedCytokines$data, 1)
processedCytokines$data = multiwayScale(processedCytokines$data, 2)

processedBiochemistry = NPLStoolbox::Cornejo2025$Salivary_biochemistry
processedBiochemistry$data = log(processedBiochemistry$data)
processedBiochemistry$data = multiwayCenter(processedBiochemistry$data, 1)
processedBiochemistry$data = multiwayScale(processedBiochemistry$data, 2)

colourCols = c("", "", "")
legendTitles = c("", "", "")
xLabels = c("Subject index", "Feature index", "Time index")
legendColNums = c(2,5,0)
arrangeModes = c(FALSE, FALSE, FALSE)
continuousModes = c(FALSE,FALSE,TRUE)

Tongue microbiome

CP was applied to capture the dominant source of variation in the tongue microbiome data block. The model selection procedure revealed that CORCONDIA values were too low at three components. The commented code below performs the procedure, and is not rerun here due to computational limitations.

# assessment_tongue = parafac4microbiome::assessModelQuality(processedTongue$data, numRepetitions=10, numCores=10)
# assessment_tongue$plots$overview # 1 or 2?
# a = assessment_tongue$metrics$CORCONDIA %>% as_tibble() %>% mutate(index=1:nrow(.)) %>% pivot_longer(-index) %>% ggplot(aes(x=as.factor(name),y=value)) + geom_boxplot() + xlab("Number of components") + ylab("CORCONDIA") + theme(text=element_text(size=16))
# 
# stability_tongue = parafac4microbiome::assessModelStability(processedTongue, maxNumComponents=3, numFolds=10, colourCols=colourCols, legendTitles=legendTitles, xLabels = xLabels, legendColNums=legendColNums, arrangeModes=arrangeModes, numCores=parallel::detectCores())
# stability_tongue$modelPlots[[2]]
# stability_tongue$modelPlots[[3]]
#

# cp_tongue = parafac4microbiome::parafac(processedTongue$data, nfac=2, nstart=100)
cp_tongue = readRDS("./GOHTRANS/cp_tongue.RDS")

cp_tongue$varExp

## [1] 23.00316

Therefore, a two-component CP model was selected, explaining 23.0% of the variation. However, MLR analysis revealed that both components were uninterpretable.

testMetadata(cp_tongue, 1, processedTongue)

## Joining with `by = join_by(subject, GenderID)`

##       Term    Estimate                                         CI    P_value
## 1 GenderID 121.4677838  -2.46115922752504e-05 – 0.242960179217603 0.05004334
## 2      Age   1.2397817    -0.0120682478356951 – 0.014547811241472 0.85003536
## 3      BOP   2.4968042 -0.00497023980355535 – 0.00996384822005768 0.49901353
## 4     CODS   0.5381136   -0.0119608431733796 – 0.0130370704610451 0.93035385
## 5       XI  -4.9945589   -0.0188281123330526 – 0.0088389945248416 0.46571634
##    P_adjust
## 1 0.2502167
## 2 0.9303539
## 3 0.8316892
## 4 0.9303539
## 5 0.8316892

testMetadata(cp_tongue, 2, processedTongue)

## Joining with `by = join_by(subject, GenderID)`

##       Term  Estimate                                         CI   P_value
## 1 GenderID 59.914254    -0.0657307291997982 – 0.185559237353633 0.3370374
## 2      Age -1.942659   -0.0157055547933125 – 0.0118202363189334 0.7746069
## 3      BOP -1.262477 -0.00898474294192472 – 0.00645978948107809 0.7402082
## 4     CODS -4.893111   -0.0178192803655215 – 0.0080330573884708 0.4445972
## 5       XI -6.251158  -0.0205575397778675 – 0.00805522373616812 0.3783908
##    P_adjust
## 1 0.7409954
## 2 0.7746069
## 3 0.7746069
## 4 0.7409954
## 5 0.7409954

Salivary microbiome

CP was applied to capture the dominant source of variation in the salivary microbiome data block. The model selection procedure revealed that CORCONDIA values were too low at three components. The commented code below performs the procedure, and is not rerun here due to computational limitations.

# assessment_saliva = parafac4microbiome::assessModelQuality(processedSaliva$data, numRepetitions=10, numCores=10)
# assessment_saliva$plots$overview # def 2
# b = assessment_saliva$metrics$CORCONDIA %>% as_tibble() %>% mutate(index=1:nrow(.)) %>% pivot_longer(-index) %>% ggplot(aes(x=as.factor(name),y=value)) + geom_boxplot() + xlab("Number of components") + ylab("CORCONDIA") + theme(text=element_text(size=16))
# 
# stability_saliva = parafac4microbiome::assessModelStability(processedSaliva, maxNumComponents=3, numFolds=10, colourCols=colourCols, legendTitles=legendTitles, xLabels = xLabels, legendColNums=legendColNums, arrangeModes=arrangeModes, numCores=parallel::detectCores())
# stability_saliva$modelPlots[[2]]
# stability_saliva$modelPlots[[2]]
#

# cp_saliva = parafac4microbiome::parafac(processedSaliva$data, nfac=2, nstart=100)
cp_saliva = readRDS("./GOHTRANS/cp_saliva.RDS")

cp_saliva$varExp

## [1] 18.86278

Therefore, a two-component CP model was selected, explaining 18.9% of the variation. However, MLR analysis revealed that both components were uninterpretable.

testMetadata(cp_saliva, 1, processedSaliva)

## Joining with `by = join_by(subject, GenderID)`

##       Term    Estimate                                         CI   P_value
## 1 GenderID -45.0221163    -0.178052895480722 – 0.0880086629409456 0.4938634
## 2      Age   3.0529562   -0.0115189644032717 – 0.0176248767956416 0.6710904
## 3      BOP  -1.6631517 -0.00983935629840586 – 0.00651305291998757 0.6800939
## 4     CODS  -0.9060191   -0.0145920276329919 – 0.0127799895072467 0.8931035
## 5       XI   1.7175717   -0.0134297828369515 – 0.0168649263219845 0.8180171
##    P_adjust
## 1 0.8931035
## 2 0.8931035
## 3 0.8931035
## 4 0.8931035
## 5 0.8931035

testMetadata(cp_saliva, 2, processedSaliva)

## Joining with `by = join_by(subject, GenderID)`

##       Term    Estimate                                        CI    P_value
## 1 GenderID 118.0788404  -0.00224170005304913 – 0.238399380941789 0.05412757
## 2      Age   6.5090792 -0.00667058769451647 – 0.0196887461831477 0.32036701
## 3      BOP   3.0035619 -0.00439145876278687 – 0.0103985825138551 0.41245932
## 4     CODS   0.4777822  -0.0119006159669152 – 0.0128561802899763 0.93754365
## 5       XI   0.3906881  -0.0133094337962947 – 0.0140908099806615 0.95383322
##    P_adjust
## 1 0.2706379
## 2 0.6874322
## 3 0.6874322
## 4 0.9538332
## 5 0.9538332

Salivary cytokines

CP was applied to capture the dominant source of variation in the salivary cytokine data block. The model selection procedure revealed that CORCONDIA values were too low at three components. The commented code below performs the procedure, and is not rerun here due to computational limitations.

# assessment_cytokines = parafac4microbiome::assessModelQuality(processedCytokines$data, numRepetitions=10, numCores=10)
# assessment_cytokines$plots$overview # def 2
# c = assessment_cytokines$metrics$CORCONDIA %>% as_tibble() %>% mutate(index=1:nrow(.)) %>% pivot_longer(-index) %>% ggplot(aes(x=as.factor(name),y=value)) + geom_boxplot() + xlab("Number of components") + ylab("CORCONDIA") + theme(text=element_text(size=16))
#
# stability_cytokines = parafac4microbiome::assessModelStability(processedCytokines, maxNumComponents=3, numFolds=10, colourCols=colourCols, legendTitles=legendTitles, xLabels = xLabels, legendColNums=legendColNums, arrangeModes=arrangeModes, numCores=parallel::detectCores())
# stability_cytokines$modelPlots[[2]]
# stability_cytokines$modelPlots[[3]]
#

# cp_cytokines = parafac4microbiome::parafac(processedCytokines$data, nfac=2, nstart=100)
cp_cytokines = readRDS("./GOHTRANS/cp_cytokines.RDS")

cp_cytokines$varExp

## [1] 55.68743

Therefore, a two-component CP model was selected, explaining 55.7% of the variation. However, MLR analysis revealed that both components were uninterpretable.

testMetadata(cp_cytokines, 1, processedCytokines)

## Joining with `by = join_by(subject, GenderID)`

##       Term     Estimate                                        CI   P_value
## 1 GenderID -27.05017814    -0.251532298438523 – 0.197431942164215 0.8007915
## 2      Age  16.67084513 -0.00723963736661261 – 0.0405813276341731 0.1579702
## 3      BOP  -0.09082505  -0.0164640352183346 – 0.0162823851145396 0.9907222
## 4     CODS -28.95668163     -0.402598657967525 – 0.34468529471301 0.8710047
## 5       XI -15.99762836  -0.0429168315278513 – 0.0109215747988014 0.2245843
##    P_adjust
## 1 0.9907222
## 2 0.5614609
## 3 0.9907222
## 4 0.9907222
## 5 0.5614609

testMetadata(cp_cytokines, 2, processedCytokines)

## Joining with `by = join_by(subject, GenderID)`

##       Term   Estimate                                        CI   P_value
## 1 GenderID 162.363038   -0.0550666388070594 – 0.379792714707578 0.1323173
## 2      Age  -6.714968   -0.029874266224001 – 0.0164443312143381 0.5458402
## 3      BOP -10.075196 -0.0259340174258618 – 0.00578362484505684 0.1957476
## 4     CODS  95.954957    -0.265948494090199 – 0.457858408923401 0.5803389
## 5       XI  -1.713409    -0.0277869044719951 – 0.02436008731968 0.8904703
##    P_adjust
## 1 0.4893690
## 2 0.7254237
## 3 0.4893690
## 4 0.7254237
## 5 0.8904703

Salivary biochemistry

CP was applied to capture the dominant source of variation in the salivary biochemistry data block. The model selection procedure revealed that CORCONDIA values were inconclusive and loadings were unstable at three components. The commented code below performs the procedure, and is not rerun here due to computational limitations.

# assessment_biochemistry = parafac4microbiome::assessModelQuality(processedBiochemistry$data, numRepetitions=10, numCores=10)
# assessment_biochemistry$plots$overview # 2 or 3?
# 
# d = assessment_biochemistry$metrics$CORCONDIA %>% as_tibble() %>% mutate(index=1:nrow(.)) %>% pivot_longer(-index) %>% ggplot(aes(x=as.factor(name),y=value)) + geom_boxplot() + xlab("Number of components") + ylab("CORCONDIA") + theme(text=element_text(size=16))

# stability_biochemistry = parafac4microbiome::assessModelStability(processedBiochemistry, maxNumComponents=3, numFolds=10, colourCols=colourCols, legendTitles=legendTitles, xLabels = xLabels, legendColNums=legendColNums, arrangeModes=arrangeModes, numCores=parallel::detectCores())
# stability_biochemistry$modelPlots[[2]]
# stability_biochemistry$modelPlots[[3]]

# cp_biochemistry = parafac4microbiome::parafac(processedBiochemistry$data, nfac=2, nstart=100)
cp_biochemistry = readRDS("./GOHTRANS/cp_biochemistry.RDS")

cp_biochemistry$varExp

## [1] 35.60952

Therefore, a two-component CP model was selected, explaining 35.6% of the variation. However, MLR analysis revealed that both components were uninterpretable.

testMetadata(cp_biochemistry, 1, processedBiochemistry)

## Joining with `by = join_by(subject, GenderID)`

##       Term  Estimate                                         CI    P_value
## 1 GenderID 81.395726    -0.0343343553682499 – 0.197125807465893 0.16076105
## 2      Age  3.177143  -0.00949969477808753 – 0.0158539800238245 0.61171034
## 3      BOP  6.688526 -0.000424360420959855 – 0.0138014124523289 0.06428731
## 4     CODS  4.938192  -0.00696794674040549 – 0.0168443303204457 0.40275497
## 5       XI  8.405007  -0.00477242944168497 – 0.0215824425531161 0.20199880
##    P_adjust
## 1 0.3366647
## 2 0.6117103
## 3 0.3214366
## 4 0.5034437
## 5 0.3366647

testMetadata(cp_biochemistry, 2, processedBiochemistry)

## Joining with `by = join_by(subject, GenderID)`