Jinqiao cohort
Table of Contents
Brief
In 2022, Jiangnan University initialized a children cohort in China, Wuxi, namely Jinqiao cohort.We investigated potential novel association between MAFLD and certain lesser-explored biomarkers.
Published in BMC Gastroenterology. [paper]
Data
Contact me ([email protected]) or Dr. Le Zhang ([email protected]) to access the data.Data clean
Load packages and define functions:
libs <- c("dplyr", "tidyr", "ggplot2", "ggpubr", "ggsci", "psych", "ggcorrplot", "corrplot", "pROC", "plotROC")
lapply(libs, require, character.only = TRUE)
options(stringsAsFactors = F)
## extract element with pattern from vector
# e.g., get('a', c('AS', 'Ba', 'C')) ; get('a', c('AS', 'Ba', 'C'), exact=F)
get <- function(key, vec, exact = F, v = F) {
vec <- unlist(vec)
if (exact == T) {
out <- vec[grep(key, vec, invert = v)]
} else {
out <- vec[grep(toupper(key), toupper(vec), invert = v)]
}
return(out)
}
# calculate proportion of a given var for all, male, female participants
# e.g,
# temp = data.frame(sex=c('Male', 'Female'), MAFLD=c(1, 0))
# get_prop(temp, 'sex', 'MAFLD')
get_prop <- function(df, col, group_col) {
for (group in c("0|1", "1", "0")) {
sub <- df[grepl(group, df[, group_col]), ]
tab <- table(sub[, col])
frq <- data.frame(col = names(tab), n = as.numeric(tab))
frq <- frq %>% mutate(n = paste0(n, "(", sprintf("%.2f", n * 100 / sum(tab)), "%)"))
group1 <- case_when(group == "0|1" ~ "all", group == "1" ~ group_col, group == "0" ~ paste0("non-", group_col))
print(paste0("distribution of ", group1, " participants :"))
print(frq)
}
}
Clean the data (remove redundancy variables, and recodes the variables, etc):
# load df
load("path_of_data")
# inbody, drop right
df <- df[, !grepl("FFM%|.Right", names(df))]
names(df) <- gsub(".", "_", names(df), fixed = T)
names(df) <- gsub("_of_", "_", names(df), fixed = T)
names(df) <- gsub("Left_", "", names(df), fixed = T)
names(df) <- gsub("Circumference", "C", names(df), fixed = T)
names(df) <- gsub("Muscle_C", "MC", names(df))
names(df) = gsub("Fat_Thickness", "FT", names(df))
df = df %>%
mutate(BMI = as.numeric(weight) / ((as.numeric(height) / 100)^2), sex = ifelse(sex == "男", "m", "f"), Month = round(age * 12)) %>%
rename(Age = age, Sex = sex)
## recode
# bmi, orinal values according to who criteria
vars <- names(df)[which(names(df) == "SBP"):which(names(df) == "PDW")] # all vars
cat_vars <- c('Sex', "ERY", "URO", "PRO", "LEU", "VC", "UCA") # categorical
num_vars <- c('Age', vars[!vars %in% cat_vars]) # numeric
diagnosis_vars = c('BMI', 'SBP', 'DBP', 'GLU', 'TG', 'HDLC', 'C_Abdomen') # biomarkers used in diagnosed
## recode
# cat_vars, convert to orinal values
for (col in c("ERY", "URO", "PRO", "LEU", "VC", "UCA")) {
print(col)
cat("before recode:\n")
print(table(df[, col]))
var = df[, col]
if (col == "UCA") {
var1 = case_when(var == "<1.00" ~ 0, var == "2.5" ~ 1, var == "5" ~ 2)
} else if (col == "URO") {
var1 = case_when(var == "阴性" ~ 0, var == "阳性+" ~ 1, var == "阳性++" ~ 2)
} else {
var1 = case_when(var == "阴性" ~ 0, var == "弱阳性" ~ 1, var == "阳性+" ~ 2, var == "阳性++" ~ 3, var == "阳性+++" ~ 4)
}
df[, col] = var1
cat("after recode:\n")
print(table(df[, col]))
}
Characteristics description
Here we measure the distribution of categorical biomarkers by proportion and that of numeric ones by mean and sd.Distribution comparison was conducted according to data type.
## distribution description
# categorical biomarkers, n and proportion
for (col in c("Sex", "BMI", cat_vars)) {
print(col)
get_prop(df, col, group_col = "MAFLD")
}
# numeric biomarkers, mean, sd, iqr
describe(df[, num_vars])
describeBy(df[, num_vars], list(df$MAFLD))
## distribution comparison
# fisher test
for (var in cat_vars) {
print(var)
print(fisher.test(table(df[, var], df$MAFLD), simulate.p.value = TRUE)) # here change to fisher test
}
# wilcox test
for (var in num_vars) {
print(var)
shapiro = shapiro.test(df[, var])
if (shapiro$p.value < 0.05) {
test = wilcox.test(df[, var] ~ df$MAFLD)
} else {
test = t.test(df[, var] ~ df$MAFLD)
}
print(test)
}
Alternative biomarkers
Here we measured the association between diagnosis biomarker and other biomarker. If a biomarker is associated with diagnosis biomarker, it is an alternative biomarker but not a novel biomarker.
out = c()
for (i in diagnosis_vars){
for (j in vars[vars!=i]){
test = cor.test(df[,i], df[,j], use = "complete.obs")
cor = test$estimate
cor_p = test$p.value
out = c(out, i, j, cor, cor_p)
}
}
res = data.frame(matrix(out, ncol=4, byrow=T))
names(res) = c('diagnosis_var', 'other_var', 'cor', 'p')
res = res%>%mutate(cor=as.numeric(cor), p=as.numeric(p))
res1 = res%>%filter(p<0.05&abs(cor)>0.8)
plots <- list()
for (var in unique(res1$diagnosis_var)){
df_p = res%>%filter(diagnosis_var==!!var)%>%arrange(cor)%>%filter(p<0.05&cor>0.2)%>%
mutate(cor=abs(cor), diagnosis_var=gsub('_', ' of ', diagnosis_var), other_var=gsub('_', ' of ', other_var))
df_p = df_p%>%mutate(other_var=factor(other_var, levels=df_p$other_var))
ylab = paste0('Correlation coefficient with ', ifelse(var=='BMI', 'BMI', 'abdomen circumference'))
p = ggplot(df_p, aes(x=other_var, y=cor))+
geom_bar(stat="identity", width=0.7, fill="steelblue") +
xlab('') + ylab(ylab) +
theme(plot.title = element_text(size = 15, face = "bold", hjust = 0.5)) +
coord_flip() +
geom_hline(yintercept = 0.8, color = "black", linetype = 2) +
scale_fill_manual(values = c("steelblue", "green"), breaks = c(0, 1),
labels = c("Correlation < 0.8", "Correlation >= 0.8")) +
scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2))
plots[[var]] = p
}
png("plot/cor1.png", height = 800, width = 1200, res = 100)
ggarrange(plots[[1]], plots[[2]],
nrow = 1, ncol = 2, hjust = 0.1, vjust = 0.1)
dev.off()
# diagnosis_vars and vars associated with diagnosis_vars
drop_vars = unlist(res1[,1:2])
After excluding the above biomarkers (associated with diagnosis biomarker), we investigate pair-wise correlation witnin remainings:
vars = list()
vars[['inbody']] = c(names(df)[which(names(df) == "BFM"):which(names(df) == "FT_Thigh")]) # 31
vars[['blood_biochemical']] = c(names(df)[which(names(df) == "INS"):which(names(df) == "UREA/CREA")]) # 27
vars[['blood_composition']] = c(names(df)[which(names(df) == "WBC"):which(names(df) == "PDW")]) # 22
vars[['urine']] = c("URBC", "UWBC", "UPRO", "UPCR", "UCREA", "SG", "PH", "EC", "MUCS") # 9
plots <- list()
for (i in names(vars)) {
keep_col <- vars[[i]]
keep_col = keep_col[!keep_col%in%drop_vars]
print(i)
print(keep_col)
sub <- df[, keep_col]
mat_cor <- cor(sub)
mat_p <- corr.test(sub, adjust = "none")[["p"]]
p <- ggcorrplot(mat_cor,
p.mat = mat_p, type = "lower", hc.order = T, insig = "blank", outline.col = "white",
ggtheme = ggplot2::theme_gray) +
theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position='none')
plots[[i]] <- p
}
png("plot/cor_inbody.png", height = 800, width = 1800, res = 100)
ggarrange(plots[[1]], nrow = 1, ncol = 1, hjust = 0.1, vjust = 0.1, common.legend = T, legend = "bottom")
dev.off()
png("plot/cor2.png", height = 800, width = 1800, res = 100)
ggarrange(plots[[2]], plots[[3]], plots[[4]],
nrow = 1, ncol = 3, hjust = 0.1, vjust = 0.1,
common.legend = T, legend = "bottom"
)
dev.off()
Novel biomarkers
Here we identify novel biomarkers with remainnings in previous step.First, for each biomarker, we measure its association with MAFLD, including age and sex as covariates.
Then, for those with significant p-value in univariate analysis, we perform multivariates analysis for variable selection.
biomakers = unlist(vars)
biomakers = biomakers[!biomakers%in%c(drop_vars, diagnosis_vars)]
res <- data.frame()
for (biomaker in biomakers) {
reg <- glm(df$MAFLD ~ df[, biomaker] + df$Age + df$Sex, df, family = binomial()) # I add age and sex here.
coef <- data.frame(summary(reg)$coefficients)
coef <- coef[2, c(1, 2, 4)]
coef <- c(biomaker, coef)
names(coef) <- c("biomarker", "beta", "se", "p")
res <- rbind(res, coef)
}
vars <- unname(unlist(res %>% filter(p < 0.05) %>% select(biomarker)))
sub <- df[, c("MAFLD", vars)]
reg <- glm(MAFLD ~ ., family = binomial(), data = sub)
summary(reg)
reg1 <- step(reg)
coef1 <- data.frame(summary(reg1)$coefficients)
coef1 <- coef1[2:nrow(coef1), c(1, 2, 4)]
coef1 <- cbind(rownames(coef1), coef1)
names(coef1) <- c("biomarker", "beta", "se", "p")
row.names(coef1) <- NULL
# density plot
vars <- names(reg1$coefficients)[-1]
vars <- gsub("`", "", vars)
df_p <- df[, c(vars, "MAFLD")]
df_p <- df_p %>%
gather(variable, value, -MAFLD) %>%
mutate(MAFLD = as.character(MAFLD))
p <- ggplot(df_p, aes(x = value, group = MAFLD, fill = MAFLD)) +
geom_density(alpha = 0.5, , adjust = 0.3) +
facet_wrap(~variable, scales = "free") +
scale_y_continuous(labels = function(x) sprintf("%.1f", x)) +
xlab("") +
ylab("") +
theme(
legend.position = c(0.9, 0.1),
legend.box = "inside"
)
png("plot/density.png", height = 1000, width = 2000, res = 160)
print(p)
dev.off()