MAFLD and Physical Activity
Table of Contents
Brief
This analysis examines the relationship between physical activity patterns and metabolic dysfunction-associated fatty liver disease (MAFLD) in adolescents aged 12-19 years using NHANES data from 2003-2006 and 2011-2014.
We use accelerometer data to objectively measure physical activity across different times of day (overall, daytime 7:00-19:00, and nighttime). MAFLD is diagnosed using the Hepatic Steatosis Index combined with metabolic risk factors. We employ survey-weighted logistic regression to account for the complex sampling design of NHANES.
Sample selection flow
Preparation
This section sets up the required R packages and creates necessary directories for data storage.
# Required R packages
library(foreign)
library(dplyr)
library(survey)
library(haven)
library(tidyr)
library(gtsummary)
library(ggplot2)
library(gridExtra)
options(timeout = 1000)
# Source custom functions
# You can either source directly from URL or download the file first
# Option 1: Source from URL
source("https://zhanghaoyang.net/project/mafld_pa/functions.r")
# Option 2: Download and source locally
# source("functions.r")
# Create folder if it doesn't exist
dir.create("data/raw", showWarnings = FALSE, recursive = TRUE)
dir.create("result", showWarnings = FALSE, recursive = TRUE)
dir.create("plot", showWarnings = FALSE, recursive = TRUE)
Demographic Data
This section collects and processes demographic data from NHANES, including age, sex, race, education, and family income.
# NHANES cycle metadata
cycles <- data.frame(
code = c("C","D","E","F","G","H","I"),
year = c(2003,2005,2007,2009,2011,2013,2015),
label = c("2003-2004","2005-2006","2007-2008","2009-2010",
"2011-2012","2013-2014","2015-2016"),
stringsAsFactors = FALSE
)
# Note: load_xpt() and get_nhanes_data() functions are sourced from functions.r
# They handle downloading/loading NHANES XPT files and extracting variables
# DEMO (C, D, G, H) with year labels and renaming
DEMO <- bind_rows(lapply(c("C","D","G","H"), function(code) {
row <- cycles[cycles$code == code, ]
df <- load_xpt(row$year, paste0("DEMO_", code))
df <- df[, c("SEQN","RIAGENDR","RIDAGEYR","RIDRETH1","DMDEDUC2",
"INDFMPIR","WTINT2YR","WTMEC2YR","SDMVPSU","SDMVSTRA")]
df$year <- row$label
df
})) %>%
rename(Sex = RIAGENDR,
Age = RIDAGEYR,
Race = RIDRETH1,
Education = DMDEDUC2,
PIR = INDFMPIR)
# Other datasets
BIOPRO <- get_nhanes_data(c("L40_C","BIOPRO_D","BIOPRO_E","BIOPRO_F","BIOPRO_G","BIOPRO_H","BIOPRO_I"), c("SEQN","LBXSATSI","LBXSASSI")) %>%
rename(ALT = LBXSATSI,
AST = LBXSASSI)
BMI <- get_nhanes_data(c("BMX_C","BMX_D","BMX_E","BMX_F","BMX_G","BMX_H","BMX_I"), c("SEQN","BMXBMI","BMXWAIST")) %>%
rename(BMI = BMXBMI,
Waist_circumference = BMXWAIST)
BPX_raw <- get_nhanes_data(c("BPX_C","BPX_D","BPX_E","BPX_F","BPX_G","BPX_H","BPX_I"), c("SEQN","BPXSY1","BPXSY2","BPXSY3","BPXDI1","BPXDI2","BPXDI3"))
BPX <- BPX_raw %>%
mutate(
SBP = rowMeans(select(., BPXSY1, BPXSY2, BPXSY3), na.rm = TRUE),
DBP = rowMeans(select(., BPXDI1, BPXDI2, BPXDI3), na.rm = TRUE)
) %>%
select(SEQN, SBP, DBP)
TRIGLY <- get_nhanes_data(c("L13AM_C","TRIGLY_D","TRIGLY_E","TRIGLY_F","TRIGLY_G","TRIGLY_H","TRIGLY_I"), c("SEQN","LBXTR")) %>%
rename(TG = LBXTR)
HDL_raw <- get_nhanes_data(c("l13_c","HDL_D","HDL_E","HDL_F","HDL_G","HDL_H","HDL_I"), c("SEQN","LBXHDD","LBDHDD"))
HDL <- HDL_raw %>%
mutate(HDL = coalesce(LBXHDD, LBDHDD)) %>%
select(SEQN, HDL)
GLU <- get_nhanes_data(c("L10AM_C","GLU_D","GLU_E","GLU_F","GLU_G","GLU_H","GLU_I"), c("SEQN","LBXGLU")) %>%
rename(Glucose = LBXGLU)
GHB <- get_nhanes_data(c("L10_C","GHB_D","GHB_E","GHB_F","GHB_G","GHB_H","GHB_I"), c("SEQN","LBXGH")) %>%
rename(HbA1c = LBXGH)
DIQ <- get_nhanes_data(c("DIQ_C","DIQ_D","DIQ_E","DIQ_F","DIQ_G","DIQ_H","DIQ_I"), c("SEQN","DIQ010")) %>%
rename(diabetes_questionnaire = DIQ010)
# Final merged dataset
df <- DEMO %>%
left_join(BIOPRO, by = "SEQN") %>%
left_join(BMI, by = "SEQN") %>%
left_join(BPX, by = "SEQN") %>%
left_join(TRIGLY, by = "SEQN") %>%
left_join(HDL, by = "SEQN") %>%
left_join(GLU, by = "SEQN") %>%
left_join(GHB, by = "SEQN") %>%
left_join(DIQ, by = "SEQN")
write.csv(df, file = "data/data_demo.csv", row.names = FALSE)
Physical Activity Data
This section processes physical activity data from NHANES accelerometer data for two time periods (2003-2006 and 2011-2014). For each participant, it calculates average physical activity intensity for each hour of the day, then summarizes activity separately for daytime (7:00-19:00) and nighttime periods.
Note: The physical activity XPT files need to be manually downloaded from the NHANES website and placed in the data/raw/ directory before running this code.
You can download directly from these links:
PAXRAW_C.zip,
PAXRAW_D.zip,
PAXHR_G.xpt,
PAXHR_H.xpt.
After downloading, unzip, place all XPT files in data/raw/.
# 2003-2006, PA in minutes
pa_03_06_raw <- bind_rows(
read_xpt("data/raw/paxraw_c.xpt"),
read_xpt("data/raw/paxraw_d.xpt")
) %>%
select(SEQN, PAXINTEN, PAXHOUR, PAXN) %>%
filter(PAXN <= 10080) %>%
mutate(Day = paste0("day", ceiling(PAXN / 1440)))
# Sum PA in minutes to hours
pa_03_06_raw <- pa_03_06_raw %>%
group_by(SEQN, PAXHOUR, Day) %>%
summarise(PAXINTEN = sum(PAXINTEN, na.rm = TRUE), .groups = 'drop')
# Keep SEQNs that have all 7 days with exactly 24 hourly records each
valid_seqn <- pa_03_06_raw %>%
count(SEQN, Day) %>%
filter(n == 24) %>%
distinct(SEQN) %>%
pull(SEQN)
# Filter valid participants
pa_03_06_raw <- pa_03_06_raw %>%
filter(SEQN %in% valid_seqn)
# Calculate hourly PA for each SEQN
pa_03_06_hourly <- pa_03_06_raw %>%
group_by(SEQN, PAXHOUR) %>%
summarise(
PAXINTEN = mean(PAXINTEN, na.rm = TRUE),
.groups = 'drop'
) %>%
pivot_wider(
names_from = PAXHOUR,
names_prefix = "pa_",
values_from = PAXINTEN
)
# Filter valid participants
pa_03_06 <- pa_03_06_raw %>%
group_by(SEQN) %>%
summarise(
mean_pa_day = mean(PAXINTEN[between(PAXHOUR, 7, 19)], na.rm = TRUE),
mean_pa_night = mean(PAXINTEN[!between(PAXHOUR, 7, 19)], na.rm = TRUE),
mean_pa = mean(PAXINTEN, na.rm = TRUE)
)
# Join hourly data with summary data
pa_03_06 <- pa_03_06 %>%
left_join(pa_03_06_hourly, by = "SEQN")
write.csv(pa_03_06, file = "data/pa_2003_2006.csv", row.names = FALSE)
# 2011-2014, PA hourly
pa_11_14_raw <- bind_rows(
read_xpt("data/raw/PAXHR_G.xpt") %>% select(SEQN, PAXDAYH, PAXTMH, PAXMTSH),
read_xpt("data/raw/PAXHR_H.xpt") %>% select(SEQN, PAXDAYH, PAXTMH, PAXMTSH)
) %>%
filter(PAXTMH == 60, PAXDAYH >= 2, PAXDAYH <= 8) %>%
group_by(SEQN, PAXDAYH) %>%
filter(n() == 24) %>%
mutate(hour = (row_number() - 1) %% 24) %>%
group_by(SEQN) %>%
filter(n_distinct(PAXDAYH) == length(unique(PAXDAYH)))
# Calculate hourly PA for each SEQN
pa_11_14_hourly <- pa_11_14_raw %>%
group_by(SEQN, hour) %>%
summarise(
PAXMTSH = mean(PAXMTSH, na.rm = TRUE),
.groups = 'drop'
) %>%
pivot_wider(
names_from = hour,
names_prefix = "pa_",
values_from = PAXMTSH
)
pa_11_14 <- pa_11_14_raw %>%
summarise(
mean_pa_day = mean(PAXMTSH[between(hour, 7, 19)], na.rm = TRUE),
mean_pa_night = mean(PAXMTSH[!between(hour, 7, 19)], na.rm = TRUE),
mean_pa = mean(PAXMTSH, na.rm = TRUE)
)
# Join hourly data with summary data
pa_11_14 <- pa_11_14 %>%
left_join(pa_11_14_hourly, by = "SEQN")
write.csv(pa_11_14, file = "data/pa_2011_2014.csv", row.names = FALSE)
# Combine physical activity data
pa_03_06 = read.csv("data/pa_2003_2006.csv", stringsAsFactors = FALSE)
pa_11_14 = read.csv("data/pa_2011_2014.csv", stringsAsFactors = FALSE)
# Note: categorize_pa() function is sourced from functions.r
# It categorizes physical activity into quartiles: Low, Medium, High, Very High
# Process data for both periods
pa_03_06 <- pa_03_06 %>%
mutate(
physical_activity = categorize_pa(., "mean_pa"),
physical_activity_day = categorize_pa(., "mean_pa_day"),
physical_activity_night = categorize_pa(., "mean_pa_night")
)
pa_11_14 <- pa_11_14 %>%
mutate(
physical_activity = categorize_pa(., "mean_pa"),
physical_activity_day = categorize_pa(., "mean_pa_day"),
physical_activity_night = categorize_pa(., "mean_pa_night")
)
# Combine the processed data
pa <- bind_rows(pa_03_06, pa_11_14)
write.csv(pa, file = "data/pa_combined.csv", row.names = FALSE)
Combine Data
This section merges demographic and physical activity data, creating a comprehensive dataset for analysis.
df <- read.csv("data/data_demo.csv", stringsAsFactors = FALSE)
pa = read.csv("data/pa_combined.csv", stringsAsFactors = FALSE)
df <- left_join(df, pa, by = 'SEQN')
Diagnosis
This section defines and calculates diagnostic criteria for key health conditions in adolescents. Diabetes is diagnosed using fasting glucose ≥126 mg/dL, HbA1c ≥6.5%, or self-reported diagnosis. Obesity is determined using age- and sex-specific BMI thresholds and waist circumference criteria. Metabolic syndrome is defined using age-specific criteria for blood pressure, triglycerides, HDL cholesterol, and TG/HDL ratio. MAFLD (Metabolic dysfunction-Associated Fatty Liver Disease) is diagnosed using the Hepatic Steatosis Index (HSI ≥36) plus presence of diabetes, obesity, or metabolic syndrome.
# Diabetes diagnosis (>126mg/dl)
df$Diabetes <- ifelse(
is.na(df$diabetes_questionnaire) & is.na(df$Glucose) & is.na(df$HbA1c), NA,
ifelse(df$diabetes_questionnaire == 1, 1,
ifelse(df$diabetes_questionnaire == 2, 0,
ifelse(df$Glucose >= 126, 1,
ifelse(df$HbA1c >= 6.5, 1, 0)
)
)
)
)
# HSI calculation and assessment
df <- df %>%
mutate(
Sex = ifelse(Sex == 2, 1, ifelse(Sex == 1, 0, Sex)),
HSI = 8 * (ALT / AST) + BMI + 2 * Diabetes + 2 * Sex,
HSI_assessment = ifelse(HSI >= 36, 1, 0)
)
# Obesity diagnosis
bmi_medians <- data.frame(
Age = 6:19,
Boy_SD = c(16.9,17.2,17.6,18.1,18.8,19.5,20.3,21.2,22.2,23.0,23.8,24.5,25.1,25.4),
Girl_SD = c(17.1,17.5,18.0,18.6,19.4,20.2,21.2,22.2,23.1,23.8,24.3,24.6,24.9,25.0)
)
thresholds <- data.frame(
Age = rep(6:19, each = 2),
Sex = rep(c(1, 0), times = 14),
waist_threshold = c(61.2,63.2,60.2,63.6,62.5,66.8,65.1,70.0,67.8,73.1,70.4,75.6,72.6,77.4,
74.0,78.6,74.9,79.6,75.5,80.5,75.8,81.3,76.0,82.1,76.1,83.0,76.1,83.0)
)
df <- df %>%
left_join(bmi_medians, by = "Age") %>%
left_join(thresholds, by = c("Age", "Sex")) %>%
mutate(
obesity1 = ifelse(Sex == 0 & BMI > Boy_SD | Sex == 1 & BMI > Girl_SD, "yes", "no"),
obesity2 = ifelse(Waist_circumference >= waist_threshold, "yes", "no"),
Obesity = ifelse(obesity1 == "yes" | obesity2 == "yes", 1, 0)
) %>%
select(-Boy_SD, -Girl_SD, -waist_threshold, -obesity1, -obesity2)
# Metabolic syndrome diagnosis
df <- df %>%
mutate(
condition_count = rowSums(
cbind(
TG > 100, # Triglycerides > 100mg/dL
HDL < 38.67, # HDL < 38.67
SBP > 120, # Systolic BP > 120 mmHg
DBP > 80, # Diastolic BP > 80 mmHg
(TG / HDL) > 2.25 # TG/HDL ratio > 2.25
),
na.rm = TRUE
),
condition_count_10_19 = rowSums(
cbind(
SBP > 130, # Systolic BP > 130 mmHg
DBP > 85, # Diastolic BP > 85 mmHg
TG > 150, # Triglycerides > 150 mg/dL
HDL < 40, # HDL < 40 mg/dL
(TG / HDL) > 2.25 # TG/HDL ratio > 2.25
),
na.rm = TRUE
),
metabolic_syndrome = ifelse(
(Age >= 2 & Age <= 9 & condition_count >= 2) |
(Age >= 10 & Age <= 19 & condition_count_10_19 >= 2),
1, 0
)
) %>%
select(-condition_count, -condition_count_10_19)
# MAFLD diagnosis
df$MAFLD <- ifelse(df$HSI_assessment == 1 & (df$Diabetes == 1 | df$Obesity == 1 | df$metabolic_syndrome == 1), 1, 0)
# Save dataset
write.csv(df, file = "data/data_withDiagnosis.csv", row.names = FALSE)
Data Cleaning
This section filters the data to include only adolescents aged 12-19 years, recodes categorical variables with meaningful labels, and removes incomplete cases. The cleaned dataset combines both time periods (2003-2006 and 2011-2014) for analysis.
df = read.csv("data/data_withDiagnosis.csv", stringsAsFactors = FALSE)
# First convert Age to numeric for filtering
df = df %>%
mutate(Age = as.numeric(Age))
# Filter age first
df = df %>%
filter(Age >= 12 & Age <= 19)
# Then do the rest of the transformations
df = df %>%
mutate(
year = ifelse(year %in% c("2003-2004","2005-2006"), "2003-2006", "2011-2014"),
Sex = recode(Sex, '0' = "Male", '1' = "Female"),
Race = case_when(
Race == 1 ~ 'Mexican',
Race == 3 ~ 'White',
Race == 4 ~ 'Black',
TRUE ~ 'Other'
) %>% factor(levels = c('Mexican', 'White', 'Black', 'Other')),
Family_income = case_when(
PIR <= 1.3 ~ "Low",
PIR > 1.3 & PIR <= 3.5 ~ "Medium",
TRUE ~ "High"
) %>% factor(levels = c('Low', 'Medium', 'High')),
Diabetes = recode(Diabetes, '1' = "yes", '0' = "no") %>% factor(levels = c("no", "yes")),
Obesity = recode(Obesity, '1' = "yes", '0' = "no") %>% factor(levels = c("no", "yes")),
Sex = factor(Sex, levels = c('Male', 'Female')),
physical_activity = factor(physical_activity, levels = c("Low", "Medium", "High", "Very High"), ordered = TRUE),
physical_activity_day = factor(physical_activity_day, levels = c("Low", "Medium", "High", "Very High"), ordered = TRUE),
physical_activity_night = factor(physical_activity_night, levels = c("Low", "Medium", "High", "Very High"), ordered = TRUE)
)
# Check final age distribution
print("Age distribution after filtering:")
table(df$Age, df$year)
# Filter complete cases
df = df %>% filter(complete.cases(MAFLD, physical_activity))
# Save dataset
save(df, file = "data/data_clean.RData")
Baseline Characteristics
This section generates descriptive statistics for the study population, comparing participants with and without MAFLD.
Tables are created separately for each time period (2003-2006 and 2011-2014) using the gtsummary package,
with statistical tests comparing groups (t-tests for continuous variables, chi-square tests for categorical variables).
load("data/data_clean.RData")
for (range in c("2003-2006", "2011-2014")) {
data <- df %>% filter(year == range)
tbl <- data %>%
select(
MAFLD, Sex, Age, Race, Family_income,
BMI, ALT, AST, SBP, DBP,
Diabetes, Obesity, physical_activity
) %>%
tbl_summary(
by = MAFLD,
type = list(
c(Age, BMI, ALT, AST, SBP, DBP) ~ "continuous"
),
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
missing = "no"
) %>%
add_p() %>%
add_overall()
# Convert to data.frame and write to CSV
tbl_df <- as_tibble(as.data.frame(tbl))
write.csv(tbl_df, file = paste0("result/baseline_", range, ".csv"), row.names = FALSE)
}
Regression Analysis
This section performs regression analysis to examine the association between physical activity and MAFLD.
The analysis uses survey-weighted logistic regression with three models: unadjusted (Model 1), age and sex adjusted (Model 2),
and fully adjusted for age, sex, race, and income (Model 3). All regression functions (build_formula(),
calculate_ci(), format_results(), and perform_regression()) are sourced from functions.r.
# ===============================================
# Regression Analysis
# ===============================================
# Note: All regression functions are sourced from functions.r:
# - build_formula(): Constructs regression formulas
# - calculate_ci(): Calculates confidence intervals using Wald method
# - format_results(): Formats regression results with ORs and CIs
# - perform_regression(): Main function that runs three models
load("data/data_clean.RData")
# Run analysis for each year period
results_list <- lapply(c("2003-2006", "2011-2014"), function(period) {
# Filter data for the period
df_period <- df %>% filter(year == period)
# Create survey design for the period
nhs <- svydesign(data=df_period, ids=~SDMVPSU, strata = ~SDMVSTRA,
weights=~WTINT2YR, nest = TRUE)
# Run analysis for all activity variables
results <- do.call(rbind, lapply(c("physical_activity", "physical_activity_day", "physical_activity_night"),
function(var) {
res <- perform_regression(var, nhs)
res$Activity <- rep(c("All", "Day", "Night")[match(var, c("physical_activity", "physical_activity_day", "physical_activity_night"))],
nrow(res))
res$Year <- period
res
}))
})
# Combine results from both periods
results <- do.call(rbind, results_list)
# Save results
write.csv(results, file = "result/regression.csv", row.names = FALSE)
# ===============================================
# Regression Analysis for subgroup
# ===============================================
# Note: perform_subgroup_regression() function is sourced from functions.r
# Some subgroups use regular logistic regression instead of survey-weighted analysis
# due to insufficient PSUs (Primary Sampling Units) within strata after stratification.
# The function automatically falls back to regular logistic regression when needed.
# age group
df = df %>%
mutate(
Age_group = cut(Age,
breaks = c(12, 15, 17, 19),
labels = c("12-14", "15-16", "17-19"),
include.lowest = TRUE)
)
# Run subgroup analysis for each year period
subgroup_results_list <- lapply(c("2003-2006", "2011-2014"), function(period) {
# Filter data for the period
df_period <- df %>% filter(year == period)
# Run analysis for each subgroup variable
subgroup_vars <- c("Age_group", "Sex", "Race", "PIR")
results <- lapply(subgroup_vars, function(subgroup_var) {
# Get unique values of subgroup variable
if (subgroup_var == "Age_group") {
subgroups <- levels(df_period$Age_group)
} else {
subgroups <- levels(df_period[[subgroup_var]])
}
# Run analysis for each subgroup
subgroup_results <- lapply(subgroups, function(subgroup) {
# Filter data for the subgroup
if (subgroup_var == "Age_group") {
subgroup_data <- df_period[df_period$Age_group == subgroup, ]
} else {
subgroup_data <- df_period[df_period[[subgroup_var]] == subgroup, ]
}
# Skip if subgroup is too small
if (nrow(subgroup_data) < 20) {
print(paste("Skipping subgroup", subgroup, "due to small sample size"))
return(NULL)
}
# Run analysis for all activity variables
tryCatch({
activity_results <- do.call(rbind, lapply(c("physical_activity", "physical_activity_day", "physical_activity_night"),
function(var) {
res <- perform_subgroup_regression(var, subgroup_var, subgroup_data)
res$Activity <- rep(c("All", "Day", "Night")[match(var, c("physical_activity", "physical_activity_day", "physical_activity_night"))],
nrow(res))
res$Subgroup <- as.character(subgroup)
res$Subgroup_Var <- subgroup_var
res
}))
}, error = function(e) {
print(paste("Error in subgroup", subgroup, ":", e$message))
return(NULL)
})
})
# Remove NULL results
subgroup_results <- subgroup_results[!sapply(subgroup_results, is.null)]
if (length(subgroup_results) == 0) return(NULL)
do.call(rbind, subgroup_results)
})
# Remove NULL results
results <- results[!sapply(results, is.null)]
if (length(results) == 0) return(NULL)
results_df <- do.call(rbind, results)
results_df$Year <- period
results_df
})
# Remove NULL results and combine
subgroup_results_list <- subgroup_results_list[!sapply(subgroup_results_list, is.null)]
subgroup_results <- do.call(rbind, subgroup_results_list)
# Save results
write.csv(subgroup_results, file = "result/regression_subgroup.csv", row.names = FALSE)
Visualization
This section creates visualizations to explore the relationship between physical activity and MAFLD.
The plotting functions (create_violin_plot() and my_theme) are sourced from functions.r.
# ===============================================
# Plot: Hourly physical activity patterns
# ===============================================
# For visualization purposes, values above the 99th percentile were excluded.
# Reshape data for plotting
df_long <- df %>%
select(SEQN, year, MAFLD, starts_with("pa_")) %>%
pivot_longer(
cols = starts_with("pa_"),
names_to = "hour",
values_to = "mean_physical_activity"
) %>%
mutate(hour = as.numeric(gsub("pa_", "", hour)))
# Calculate 99th percentile for each period and MAFLD status
quantiles <- df_long %>%
group_by(year, MAFLD) %>%
summarise(q99 = quantile(mean_physical_activity, 0.99, na.rm = TRUE), .groups = "drop")
# Create violin plots for both periods
violin_plots <- list(
# 2003-2006 (no x-label, no legend)
"2003_2006" = create_violin_plot(
filter(df_long, year == "2003-2006",
mean_physical_activity <= filter(quantiles, year == "2003-2006", MAFLD == ifelse(MAFLD == 1, 1, 0))$q99),
"2003-2006", show_x_label = FALSE, show_legend = FALSE),
# 2011-2014 (show x-label and legend)
"2011_2014" = create_violin_plot(
filter(df_long, year == "2011-2014",
mean_physical_activity <= filter(quantiles, year == "2011-2014", MAFLD == ifelse(MAFLD == 1, 1, 0))$q99),
"2011-2014", show_x_label = TRUE, show_legend = TRUE)
)
# Combine violin plots
violin_combined <- grid.arrange(grobs = violin_plots, ncol = 1,
padding = unit(0.5, "lines"))
# ===============================================
# Plot: MAFLD proportion by physical activity level
# ===============================================
# Note: my_theme is sourced from functions.r
prop_data <- df %>%
pivot_longer(
cols = c(physical_activity, physical_activity_day, physical_activity_night),
names_to = "activity_type",
values_to = "activity_level"
) %>%
mutate(
time_period = case_when(
activity_type == "physical_activity" ~ "Overall",
activity_type == "physical_activity_day" ~ "Daytime",
activity_type == "physical_activity_night" ~ "Nighttime"
),
time_period = factor(time_period, levels = c("Overall", "Daytime", "Nighttime")),
activity_level = factor(activity_level, levels = c("Q1", "Q2", "Q3", "Q4"))
) %>%
group_by(year, time_period, activity_level) %>%
summarise(
total = n(),
mafld = sum(MAFLD == 1),
non_mafld = total - mafld,
.groups = "drop"
) %>%
pivot_longer(
cols = c(mafld, non_mafld),
names_to = "status",
values_to = "count"
) %>%
group_by(year, time_period, activity_level) %>%
mutate(
proportion = count / sum(count),
status = factor(status, levels = c("non_mafld", "mafld"),
labels = c("Non-MAFLD", "MAFLD"))
)
# Create proportion plot
proportion_plot <- ggplot(prop_data, aes(x = activity_level, y = proportion, fill = status)) +
geom_bar(stat = "identity", width = 0.8) +
geom_text(aes(label = scales::percent(proportion, accuracy = 1)),
position = position_stack(vjust = 0.5),
size = 5, color = "white") +
facet_grid(time_period ~ year) +
scale_fill_manual(values = c("Non-MAFLD" = "#4A9B7E", "MAFLD" = "#D2603A")) +
scale_y_continuous(labels = scales::percent_format(accuracy = 1),
breaks = seq(0, 1, 0.25)) +
labs(x = " Activity counts MIMS-units",
y = "Proportion",
fill = "Status") +
my_theme +
theme(plot.title = element_text(size = 18, hjust = 0.5, family = "Helvetica"))
# ===============================================
# Combine both plots into one plot
# ===============================================
# Combine all plots
combined_comprehensive_plot <- grid.arrange(
violin_combined,
proportion_plot,
ncol = 2,
widths = c(3, 2),
padding = unit(1, "lines")
)
# Save the combined plot
ggsave("plot/combined_pa_mafld_analysis.png", combined_comprehensive_plot,
width = 16, height = 6, dpi = 600, bg = "white")