
GOHTRANS: gender-affirming hormone therapy
2025-08-31
Source:vignettes/articles/GOHTRANS.Rmd
GOHTRANS.RmdIntroduction
This article presents the CP results of the GOHTRANS dataset discussed in Chapter 6 of my dissertation.
Preamble
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'CMTFtoolbox'
## The following object is masked from 'package:NPLStoolbox':
##
## npred
## The following objects are masked from 'package:parafac4microbiome':
##
## fac_to_vect, reinflateFac, reinflateTensor, vect_to_fac
ph_BOMP = read_delim("./GOHTRANS/GOH-TRANS_export_20240205.csv",
delim = ";", escape_double = FALSE, trim_ws = TRUE) %>% as_tibble()## Rows: 42 Columns: 347
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (104): Participant Status, Site Abbreviation, Participant Creation Date,...
## dbl (243): Participant Id, pat_genderID, Age, Informed_consent#Yes, Informed...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df1 = ph_BOMP %>% select(`Participant Id`, starts_with("5.")) %>% mutate(subject = 1:42, numTeeth = `5.1|Number of teeth`, DMFT = `5.2|DMFT`, numBleedingSites = `5.3|Bleeding sites`, boppercent = `5.4|BOP%`, DPSI = `5.5|DPSI`, pH = `5.8|pH`) %>% select(subject, numTeeth, DMFT, numBleedingSites, boppercent, DPSI, pH)
df2 = ph_BOMP %>% select(`Participant Id`, starts_with("12.")) %>% mutate(subject = 1:42, numTeeth = `12.1|Number of teeth`, DMFT = `12.2|DMFT`, numBleedingSites = `12.3|Bleeding sites`, boppercent = `12.4|BOP%`, DPSI = `12.5|DPSI`, pH = `12.8|pH`) %>% select(subject, numTeeth, DMFT, numBleedingSites, boppercent, DPSI, pH)
df3 = ph_BOMP %>% select(`Participant Id`, starts_with("19.")) %>% mutate(subject = 1:42, numTeeth = `19.1|Number of teeth`, DMFT = `19.2|DMFT`, numBleedingSites = `19.3|Bleeding sites`, boppercent = `19.4|BOP%`, DPSI = `19.5|DPSI`, pH = `19.8|pH`) %>% select(subject, numTeeth, DMFT, numBleedingSites, boppercent, DPSI, pH)
df4 = ph_BOMP %>% select(`Participant Id`, starts_with("26.")) %>% mutate(subject = 1:42, numTeeth = `26.1|Number of teeth`, DMFT = `26.2|DMFT`, numBleedingSites = `26.3|Bleeding sites`, boppercent = `26.4|BOP%`, DPSI = `26.5|DPSI`, pH = `26.8|pH`) %>% select(subject, numTeeth, DMFT, numBleedingSites, boppercent, DPSI, pH)
otherMeta = rbind(df1, df2, df3, df4) %>% as_tibble() %>% mutate(newTimepoint = rep(c(0,3,6,12), each=42))
otherMeta = otherMeta %>% select(subject, newTimepoint, boppercent)
otherMeta[otherMeta$boppercent < 0 & !is.na(otherMeta$boppercent), "boppercent"] = NA
saliva_sampleMeta = read.csv("./GOHTRANS/sampleInfo_fixed.csv", sep=" ") %>% as_tibble() %>% select(subject, GenderID, Age, newTimepoint)
# CODS XI
df = read.csv("./GOHTRANS/CODS_XI.csv")
df = df[1:40,1:10] %>% as_tibble() %>% filter(Participant_code %in% saliva_sampleMeta$subject)
CODS = df %>% select(Participant_code, CODS_BASELINE, CODS_3_MONTHS, CODS_6_MONTHS, CODS_12_MONTHS) %>% pivot_longer(-Participant_code) %>% mutate(timepoint = NA)
CODS[CODS$name == "CODS_BASELINE", "timepoint"] = 0
CODS[CODS$name == "CODS_3_MONTHS", "timepoint"] = 3
CODS[CODS$name == "CODS_6_MONTHS", "timepoint"] = 6
CODS[CODS$name == "CODS_12_MONTHS", "timepoint"] = 12
CODS[CODS$value == -99, "value"] = NA
CODS = CODS %>% mutate(subject = Participant_code, newTimepoint = timepoint, CODS = value) %>% select(-Participant_code, -name, -value, -timepoint)
XI = df %>% select(Participant_code, XI_BASELINE, XI_3_MONTHS, XI_6_MONTHS, XI_12_MONTHS) %>% pivot_longer(-Participant_code) %>% mutate(timepoint = NA)
XI[XI$name == "XI_BASELINE", "timepoint"] = 0
XI[XI$name == "XI_3_MONTHS", "timepoint"] = 3
XI[XI$name == "XI_6_MONTHS", "timepoint"] = 6
XI[XI$name == "XI_12_MONTHS", "timepoint"] = 12
XI[XI$value == -99, "value"] = NA
XI = XI %>% mutate(subject = Participant_code, newTimepoint = timepoint, XI = value) %>% select(-Participant_code, -name, -value, -timepoint)
df = otherMeta %>% left_join(CODS) %>% left_join(XI) %>% select(subject, newTimepoint, boppercent, CODS, XI) %>% filter(newTimepoint != "NA")## Joining with `by = join_by(subject, newTimepoint)`
## Joining with `by = join_by(subject, newTimepoint)`
## Joining with `by = join_by(subject, newTimepoint, boppercent)`
## Joining with `by = join_by(subject, newTimepoint)`
meta[meta < 0] <- NA
meta$GenderID = as.numeric(as.factor(meta$GenderID)) - 1
staticMeta = meta %>% filter(newTimepoint == 0) %>% select(subject, GenderID, Age)
dynamicMeta = meta %>% select(-GenderID, -Age)
#
# newBOP = residuals(lm(boppercent ~ GenderID + Age + CODS + XI, data=meta, na.action=na.exclude))
# newCODS= residuals(lm(CODS ~ boppercent + GenderID + Age + XI, data=meta, na.action=na.exclude))
# newXI = residuals(lm(XI ~ boppercent + GenderID + Age + CODS, data=meta, na.action=na.exclude))
#
# meta = meta %>% mutate(boppercent = newBOP, CODS = newCODS, XI = newXI)
clinical_metadata = NPLStoolbox::Cornejo2025$Clinical_measurements$mode1 %>% mutate(BOP = NPLStoolbox::Cornejo2025$Clinical_measurements$data[,1,2], CODS = NPLStoolbox::Cornejo2025$Clinical_measurements$data[,2,2], XI = NPLStoolbox::Cornejo2025$Clinical_measurements$data[,3,2]) %>% left_join(NPLStoolbox::Cornejo2025$Subject_metadata %>% mutate(subject = as.character(subject)))## Joining with `by = join_by(subject, GenderID)`
testMetadata = function(model, comp, metadata){
transformedSubjectLoadings = model$Fac[[1]][,comp]
transformedSubjectLoadings = transformedSubjectLoadings / norm(transformedSubjectLoadings, "2")
metadata = metadata$mode1 %>% left_join(clinical_metadata) %>% mutate(GenderID = as.numeric(as.factor(GenderID))-1)
result = lm(transformedSubjectLoadings ~ GenderID + Age + BOP + CODS + XI, data=metadata)
# Extract coefficients and confidence intervals
coef_estimates <- summary(result)$coefficients
conf_intervals <- confint(result)
# Remove intercept
coef_estimates <- coef_estimates[rownames(coef_estimates) != "(Intercept)", ]
conf_intervals <- conf_intervals[rownames(conf_intervals) != "(Intercept)", ]
# Combine into a clean data frame
summary_table <- data.frame(
Term = rownames(coef_estimates),
Estimate = coef_estimates[, "Estimate"] * 1e3,
CI = paste0(
conf_intervals[, 1], " – ",
conf_intervals[, 2]
),
P_value = coef_estimates[, "Pr(>|t|)"],
P_adjust = p.adjust(coef_estimates[, "Pr(>|t|)"], "BH"),
row.names = NULL
)
return(summary_table)
}
processedTongue = processDataCube(NPLStoolbox::Cornejo2025$Tongue_microbiome, sparsityThreshold = 0.5, considerGroups=TRUE, groupVariable="GenderID", CLR=TRUE, centerMode=1, scaleMode=2)
processedSaliva = processDataCube(NPLStoolbox::Cornejo2025$Salivary_microbiome, sparsityThreshold = 0.5, considerGroups=TRUE, groupVariable="GenderID", CLR=TRUE, centerMode=1, scaleMode=2)
processedCytokines = NPLStoolbox::Cornejo2025$Salivary_cytokines
processedCytokines$data = log(processedCytokines$data + 0.1300)
# Remove subject 1, 18, 19, and 27 due to being an outlier
processedCytokines$data = processedCytokines$data[-c(1,9,10,19),,]
processedCytokines$mode1 = processedCytokines$mode1[-c(1,9,10,19),]
processedCytokines$data = multiwayCenter(processedCytokines$data, 1)
processedCytokines$data = multiwayScale(processedCytokines$data, 2)
processedBiochemistry = NPLStoolbox::Cornejo2025$Salivary_biochemistry
processedBiochemistry$data = log(processedBiochemistry$data)
processedBiochemistry$data = multiwayCenter(processedBiochemistry$data, 1)
processedBiochemistry$data = multiwayScale(processedBiochemistry$data, 2)Tongue microbiome
CP was applied to capture the dominant source of variation in the tongue microbiome data block. The model selection procedure revealed that CORCONDIA values were too low at three components. The commented code below performs the procedure, and is not rerun here due to computational limitations.
# assessment_tongue = parafac4microbiome::assessModelQuality(processedTongue$data, numRepetitions=10, numCores=10)
# assessment_tongue$plots$overview # 1 or 2?
# a = assessment_tongue$metrics$CORCONDIA %>% as_tibble() %>% mutate(index=1:nrow(.)) %>% pivot_longer(-index) %>% ggplot(aes(x=as.factor(name),y=value)) + geom_boxplot() + xlab("Number of components") + ylab("CORCONDIA") + theme(text=element_text(size=16))
#
# stability_tongue = parafac4microbiome::assessModelStability(processedTongue, maxNumComponents=3, numFolds=10, colourCols=colourCols, legendTitles=legendTitles, xLabels = xLabels, legendColNums=legendColNums, arrangeModes=arrangeModes, numCores=parallel::detectCores())
# stability_tongue$modelPlots[[2]]
# stability_tongue$modelPlots[[3]]
#
# cp_tongue = parafac4microbiome::parafac(processedTongue$data, nfac=2, nstart=100)
cp_tongue = readRDS("./GOHTRANS/cp_tongue.RDS")
cp_tongue$varExp## [1] 23.00316
Therefore, a two-component CP model was selected, explaining 23.0% of the variation. However, MLR analysis revealed that both components were uninterpretable.
testMetadata(cp_tongue, 1, processedTongue)## Joining with `by = join_by(subject, GenderID)`
## Term Estimate CI P_value
## 1 GenderID 121.4677838 -2.46115922752504e-05 – 0.242960179217603 0.05004334
## 2 Age 1.2397817 -0.0120682478356951 – 0.014547811241472 0.85003536
## 3 BOP 2.4968042 -0.00497023980355535 – 0.00996384822005768 0.49901353
## 4 CODS 0.5381136 -0.0119608431733796 – 0.0130370704610451 0.93035385
## 5 XI -4.9945589 -0.0188281123330526 – 0.0088389945248416 0.46571634
## P_adjust
## 1 0.2502167
## 2 0.9303539
## 3 0.8316892
## 4 0.9303539
## 5 0.8316892
testMetadata(cp_tongue, 2, processedTongue)## Joining with `by = join_by(subject, GenderID)`
## Term Estimate CI P_value
## 1 GenderID 59.914254 -0.0657307291997982 – 0.185559237353633 0.3370374
## 2 Age -1.942659 -0.0157055547933125 – 0.0118202363189334 0.7746069
## 3 BOP -1.262477 -0.00898474294192472 – 0.00645978948107809 0.7402082
## 4 CODS -4.893111 -0.0178192803655215 – 0.0080330573884708 0.4445972
## 5 XI -6.251158 -0.0205575397778675 – 0.00805522373616812 0.3783908
## P_adjust
## 1 0.7409954
## 2 0.7746069
## 3 0.7746069
## 4 0.7409954
## 5 0.7409954
Salivary microbiome
CP was applied to capture the dominant source of variation in the salivary microbiome data block. The model selection procedure revealed that CORCONDIA values were too low at three components. The commented code below performs the procedure, and is not rerun here due to computational limitations.
# assessment_saliva = parafac4microbiome::assessModelQuality(processedSaliva$data, numRepetitions=10, numCores=10)
# assessment_saliva$plots$overview # def 2
# b = assessment_saliva$metrics$CORCONDIA %>% as_tibble() %>% mutate(index=1:nrow(.)) %>% pivot_longer(-index) %>% ggplot(aes(x=as.factor(name),y=value)) + geom_boxplot() + xlab("Number of components") + ylab("CORCONDIA") + theme(text=element_text(size=16))
#
# stability_saliva = parafac4microbiome::assessModelStability(processedSaliva, maxNumComponents=3, numFolds=10, colourCols=colourCols, legendTitles=legendTitles, xLabels = xLabels, legendColNums=legendColNums, arrangeModes=arrangeModes, numCores=parallel::detectCores())
# stability_saliva$modelPlots[[2]]
# stability_saliva$modelPlots[[2]]
#
# cp_saliva = parafac4microbiome::parafac(processedSaliva$data, nfac=2, nstart=100)
cp_saliva = readRDS("./GOHTRANS/cp_saliva.RDS")
cp_saliva$varExp## [1] 18.86278
Therefore, a two-component CP model was selected, explaining 18.9% of the variation. However, MLR analysis revealed that both components were uninterpretable.
testMetadata(cp_saliva, 1, processedSaliva)## Joining with `by = join_by(subject, GenderID)`
## Term Estimate CI P_value
## 1 GenderID -45.0221163 -0.178052895480722 – 0.0880086629409456 0.4938634
## 2 Age 3.0529562 -0.0115189644032717 – 0.0176248767956416 0.6710904
## 3 BOP -1.6631517 -0.00983935629840586 – 0.00651305291998757 0.6800939
## 4 CODS -0.9060191 -0.0145920276329919 – 0.0127799895072467 0.8931035
## 5 XI 1.7175717 -0.0134297828369515 – 0.0168649263219845 0.8180171
## P_adjust
## 1 0.8931035
## 2 0.8931035
## 3 0.8931035
## 4 0.8931035
## 5 0.8931035
testMetadata(cp_saliva, 2, processedSaliva)## Joining with `by = join_by(subject, GenderID)`
## Term Estimate CI P_value
## 1 GenderID 118.0788404 -0.00224170005304913 – 0.238399380941789 0.05412757
## 2 Age 6.5090792 -0.00667058769451647 – 0.0196887461831477 0.32036701
## 3 BOP 3.0035619 -0.00439145876278687 – 0.0103985825138551 0.41245932
## 4 CODS 0.4777822 -0.0119006159669152 – 0.0128561802899763 0.93754365
## 5 XI 0.3906881 -0.0133094337962947 – 0.0140908099806615 0.95383322
## P_adjust
## 1 0.2706379
## 2 0.6874322
## 3 0.6874322
## 4 0.9538332
## 5 0.9538332
Salivary cytokines
CP was applied to capture the dominant source of variation in the salivary cytokine data block. The model selection procedure revealed that CORCONDIA values were too low at three components. The commented code below performs the procedure, and is not rerun here due to computational limitations.
# assessment_cytokines = parafac4microbiome::assessModelQuality(processedCytokines$data, numRepetitions=10, numCores=10)
# assessment_cytokines$plots$overview # def 2
# c = assessment_cytokines$metrics$CORCONDIA %>% as_tibble() %>% mutate(index=1:nrow(.)) %>% pivot_longer(-index) %>% ggplot(aes(x=as.factor(name),y=value)) + geom_boxplot() + xlab("Number of components") + ylab("CORCONDIA") + theme(text=element_text(size=16))
#
# stability_cytokines = parafac4microbiome::assessModelStability(processedCytokines, maxNumComponents=3, numFolds=10, colourCols=colourCols, legendTitles=legendTitles, xLabels = xLabels, legendColNums=legendColNums, arrangeModes=arrangeModes, numCores=parallel::detectCores())
# stability_cytokines$modelPlots[[2]]
# stability_cytokines$modelPlots[[3]]
#
# cp_cytokines = parafac4microbiome::parafac(processedCytokines$data, nfac=2, nstart=100)
cp_cytokines = readRDS("./GOHTRANS/cp_cytokines.RDS")
cp_cytokines$varExp## [1] 55.68743
Therefore, a two-component CP model was selected, explaining 55.7% of the variation. However, MLR analysis revealed that both components were uninterpretable.
testMetadata(cp_cytokines, 1, processedCytokines)## Joining with `by = join_by(subject, GenderID)`
## Term Estimate CI P_value
## 1 GenderID -27.05017814 -0.251532298438523 – 0.197431942164215 0.8007915
## 2 Age 16.67084513 -0.00723963736661261 – 0.0405813276341731 0.1579702
## 3 BOP -0.09082505 -0.0164640352183346 – 0.0162823851145396 0.9907222
## 4 CODS -28.95668163 -0.402598657967525 – 0.34468529471301 0.8710047
## 5 XI -15.99762836 -0.0429168315278513 – 0.0109215747988014 0.2245843
## P_adjust
## 1 0.9907222
## 2 0.5614609
## 3 0.9907222
## 4 0.9907222
## 5 0.5614609
testMetadata(cp_cytokines, 2, processedCytokines)## Joining with `by = join_by(subject, GenderID)`
## Term Estimate CI P_value
## 1 GenderID 162.363038 -0.0550666388070594 – 0.379792714707578 0.1323173
## 2 Age -6.714968 -0.029874266224001 – 0.0164443312143381 0.5458402
## 3 BOP -10.075196 -0.0259340174258618 – 0.00578362484505684 0.1957476
## 4 CODS 95.954957 -0.265948494090199 – 0.457858408923401 0.5803389
## 5 XI -1.713409 -0.0277869044719951 – 0.02436008731968 0.8904703
## P_adjust
## 1 0.4893690
## 2 0.7254237
## 3 0.4893690
## 4 0.7254237
## 5 0.8904703
Salivary biochemistry
CP was applied to capture the dominant source of variation in the salivary biochemistry data block. The model selection procedure revealed that CORCONDIA values were inconclusive and loadings were unstable at three components. The commented code below performs the procedure, and is not rerun here due to computational limitations.
# assessment_biochemistry = parafac4microbiome::assessModelQuality(processedBiochemistry$data, numRepetitions=10, numCores=10)
# assessment_biochemistry$plots$overview # 2 or 3?
#
# d = assessment_biochemistry$metrics$CORCONDIA %>% as_tibble() %>% mutate(index=1:nrow(.)) %>% pivot_longer(-index) %>% ggplot(aes(x=as.factor(name),y=value)) + geom_boxplot() + xlab("Number of components") + ylab("CORCONDIA") + theme(text=element_text(size=16))
# stability_biochemistry = parafac4microbiome::assessModelStability(processedBiochemistry, maxNumComponents=3, numFolds=10, colourCols=colourCols, legendTitles=legendTitles, xLabels = xLabels, legendColNums=legendColNums, arrangeModes=arrangeModes, numCores=parallel::detectCores())
# stability_biochemistry$modelPlots[[2]]
# stability_biochemistry$modelPlots[[3]]
# cp_biochemistry = parafac4microbiome::parafac(processedBiochemistry$data, nfac=2, nstart=100)
cp_biochemistry = readRDS("./GOHTRANS/cp_biochemistry.RDS")
cp_biochemistry$varExp## [1] 35.60952
Therefore, a two-component CP model was selected, explaining 35.6% of the variation. However, MLR analysis revealed that both components were uninterpretable.
testMetadata(cp_biochemistry, 1, processedBiochemistry)## Joining with `by = join_by(subject, GenderID)`
## Term Estimate CI P_value
## 1 GenderID 81.395726 -0.0343343553682499 – 0.197125807465893 0.16076105
## 2 Age 3.177143 -0.00949969477808753 – 0.0158539800238245 0.61171034
## 3 BOP 6.688526 -0.000424360420959855 – 0.0138014124523289 0.06428731
## 4 CODS 4.938192 -0.00696794674040549 – 0.0168443303204457 0.40275497
## 5 XI 8.405007 -0.00477242944168497 – 0.0215824425531161 0.20199880
## P_adjust
## 1 0.3366647
## 2 0.6117103
## 3 0.3214366
## 4 0.5034437
## 5 0.3366647
testMetadata(cp_biochemistry, 2, processedBiochemistry)## Joining with `by = join_by(subject, GenderID)`
## Term Estimate CI P_value
## 1 GenderID -1.5487722 -0.106748335420374 – 0.103650791078514 0.9761557
## 2 Age -0.3542818 -0.0118776277377812 – 0.0111690640408018 0.9502317
## 3 BOP -4.8835426 -0.0113492128200526 – 0.00158212771674754 0.1330520
## 4 CODS -5.6334319 -0.0164562063991478 – 0.00518934258952479 0.2954278
## 5 XI -6.9012715 -0.018879665442852 – 0.00507712253655297 0.2478597
## P_adjust
## 1 0.9761557
## 2 0.9761557
## 3 0.4923796
## 4 0.4923796
## 5 0.4923796