Haoyang Zhang


Histogram

MAFLD and Physical Activity

Table of Contents

Brief

This analysis examines the relationship between physical activity patterns and metabolic dysfunction-associated fatty liver disease (MAFLD) in adolescents aged 12-19 years using NHANES data from 2003-2006 and 2011-2014.

We use accelerometer data to objectively measure physical activity across different times of day (overall, daytime 7:00-19:00, and nighttime). MAFLD is diagnosed using the Hepatic Steatosis Index combined with metabolic risk factors. We employ survey-weighted logistic regression to account for the complex sampling design of NHANES.

Sample selection flow

Preparation

This section sets up the required R packages and creates necessary directories for data storage.

# Required R packages
library(foreign)
library(dplyr)
library(survey)
library(haven)
library(tidyr)
library(gtsummary)
library(ggplot2)
library(gridExtra)
options(timeout = 1000)

# Source custom functions
# You can either source directly from URL or download the file first
# Option 1: Source from URL
source("https://zhanghaoyang.net/project/mafld_pa/functions.r")

# Option 2: Download and source locally
# source("functions.r")

# Create folder if it doesn't exist
dir.create("data/raw", showWarnings = FALSE, recursive = TRUE)
dir.create("result", showWarnings = FALSE, recursive = TRUE)
dir.create("plot", showWarnings = FALSE, recursive = TRUE)

Demographic Data

This section collects and processes demographic data from NHANES, including age, sex, race, education, and family income.

# NHANES cycle metadata
cycles <- data.frame(
  code = c("C","D","E","F","G","H","I"),
  year = c(2003,2005,2007,2009,2011,2013,2015),
  label = c("2003-2004","2005-2006","2007-2008","2009-2010",
            "2011-2012","2013-2014","2015-2016"),
  stringsAsFactors = FALSE
)

# Note: load_xpt() and get_nhanes_data() functions are sourced from functions.r
# They handle downloading/loading NHANES XPT files and extracting variables

# DEMO (C, D, G, H) with year labels and renaming
DEMO <- bind_rows(lapply(c("C","D","G","H"), function(code) {
  row <- cycles[cycles$code == code, ]
  df <- load_xpt(row$year, paste0("DEMO_", code))
  df <- df[, c("SEQN","RIAGENDR","RIDAGEYR","RIDRETH1","DMDEDUC2",
               "INDFMPIR","WTINT2YR","WTMEC2YR","SDMVPSU","SDMVSTRA")]
  df$year <- row$label
  df
})) %>%
  rename(Sex = RIAGENDR,
         Age = RIDAGEYR,
         Race = RIDRETH1,
         Education = DMDEDUC2,
         PIR = INDFMPIR)

# Other datasets
BIOPRO <- get_nhanes_data(c("L40_C","BIOPRO_D","BIOPRO_E","BIOPRO_F","BIOPRO_G","BIOPRO_H","BIOPRO_I"), c("SEQN","LBXSATSI","LBXSASSI")) %>%
  rename(ALT = LBXSATSI,
         AST = LBXSASSI)

BMI <- get_nhanes_data(c("BMX_C","BMX_D","BMX_E","BMX_F","BMX_G","BMX_H","BMX_I"), c("SEQN","BMXBMI","BMXWAIST")) %>%
  rename(BMI = BMXBMI,
         Waist_circumference = BMXWAIST)

BPX_raw <- get_nhanes_data(c("BPX_C","BPX_D","BPX_E","BPX_F","BPX_G","BPX_H","BPX_I"), c("SEQN","BPXSY1","BPXSY2","BPXSY3","BPXDI1","BPXDI2","BPXDI3"))

BPX <- BPX_raw %>%
  mutate(
    SBP = rowMeans(select(., BPXSY1, BPXSY2, BPXSY3), na.rm = TRUE),
    DBP = rowMeans(select(., BPXDI1, BPXDI2, BPXDI3), na.rm = TRUE)
  ) %>%
  select(SEQN, SBP, DBP)

TRIGLY <- get_nhanes_data(c("L13AM_C","TRIGLY_D","TRIGLY_E","TRIGLY_F","TRIGLY_G","TRIGLY_H","TRIGLY_I"), c("SEQN","LBXTR")) %>%
  rename(TG = LBXTR)

HDL_raw <- get_nhanes_data(c("l13_c","HDL_D","HDL_E","HDL_F","HDL_G","HDL_H","HDL_I"), c("SEQN","LBXHDD","LBDHDD"))
HDL <- HDL_raw %>% 
  mutate(HDL = coalesce(LBXHDD, LBDHDD)) %>% 
  select(SEQN, HDL)

GLU <- get_nhanes_data(c("L10AM_C","GLU_D","GLU_E","GLU_F","GLU_G","GLU_H","GLU_I"), c("SEQN","LBXGLU")) %>%
  rename(Glucose = LBXGLU)

GHB <- get_nhanes_data(c("L10_C","GHB_D","GHB_E","GHB_F","GHB_G","GHB_H","GHB_I"), c("SEQN","LBXGH")) %>%
  rename(HbA1c = LBXGH)

DIQ <- get_nhanes_data(c("DIQ_C","DIQ_D","DIQ_E","DIQ_F","DIQ_G","DIQ_H","DIQ_I"), c("SEQN","DIQ010")) %>%
  rename(diabetes_questionnaire = DIQ010)

# Final merged dataset
df <- DEMO %>% 
  left_join(BIOPRO, by = "SEQN") %>% 
  left_join(BMI, by = "SEQN") %>% 
  left_join(BPX, by = "SEQN") %>% 
  left_join(TRIGLY, by = "SEQN") %>% 
  left_join(HDL, by = "SEQN") %>% 
  left_join(GLU, by = "SEQN") %>% 
  left_join(GHB, by = "SEQN") %>% 
  left_join(DIQ, by = "SEQN")

write.csv(df, file = "data/data_demo.csv", row.names = FALSE)

Physical Activity Data

This section processes physical activity data from NHANES accelerometer data for two time periods (2003-2006 and 2011-2014). For each participant, it calculates average physical activity intensity for each hour of the day, then summarizes activity separately for daytime (7:00-19:00) and nighttime periods.

Note: The physical activity XPT files need to be manually downloaded from the NHANES website and placed in the data/raw/ directory before running this code. You can download directly from these links: PAXRAW_C.zip, PAXRAW_D.zip, PAXHR_G.xpt, PAXHR_H.xpt. After downloading, unzip, place all XPT files in data/raw/.

# 2003-2006, PA in minutes
pa_03_06_raw <- bind_rows(
  read_xpt("data/raw/paxraw_c.xpt"),
  read_xpt("data/raw/paxraw_d.xpt")
) %>%
  select(SEQN, PAXINTEN, PAXHOUR, PAXN) %>%
  filter(PAXN <= 10080) %>%
  mutate(Day = paste0("day", ceiling(PAXN / 1440)))

# Sum PA in minutes to hours
pa_03_06_raw <- pa_03_06_raw %>%
  group_by(SEQN, PAXHOUR, Day) %>%
  summarise(PAXINTEN = sum(PAXINTEN, na.rm = TRUE), .groups = 'drop')

# Keep SEQNs that have all 7 days with exactly 24 hourly records each
valid_seqn <- pa_03_06_raw %>%
  count(SEQN, Day) %>%
  filter(n == 24) %>%
  distinct(SEQN) %>%
  pull(SEQN)

# Filter valid participants
pa_03_06_raw <- pa_03_06_raw %>%
  filter(SEQN %in% valid_seqn)

# Calculate hourly PA for each SEQN
pa_03_06_hourly <- pa_03_06_raw %>%
  group_by(SEQN, PAXHOUR) %>%
  summarise(
    PAXINTEN = mean(PAXINTEN, na.rm = TRUE),
    .groups = 'drop'
  ) %>%
  pivot_wider(
    names_from = PAXHOUR,
    names_prefix = "pa_",
    values_from = PAXINTEN
  )

# Filter valid participants
pa_03_06 <- pa_03_06_raw %>%
  group_by(SEQN) %>%
  summarise(
    mean_pa_day = mean(PAXINTEN[between(PAXHOUR, 7, 19)], na.rm = TRUE),
    mean_pa_night = mean(PAXINTEN[!between(PAXHOUR, 7, 19)], na.rm = TRUE),
    mean_pa = mean(PAXINTEN, na.rm = TRUE)
  )

# Join hourly data with summary data
pa_03_06 <- pa_03_06 %>%
  left_join(pa_03_06_hourly, by = "SEQN")

write.csv(pa_03_06, file = "data/pa_2003_2006.csv", row.names = FALSE)

# 2011-2014, PA hourly
pa_11_14_raw <- bind_rows(
  read_xpt("data/raw/PAXHR_G.xpt") %>% select(SEQN, PAXDAYH, PAXTMH, PAXMTSH),
  read_xpt("data/raw/PAXHR_H.xpt") %>% select(SEQN, PAXDAYH, PAXTMH, PAXMTSH)
) %>% 
  filter(PAXTMH == 60, PAXDAYH >= 2, PAXDAYH <= 8) %>%
  group_by(SEQN, PAXDAYH) %>%
  filter(n() == 24) %>%
  mutate(hour = (row_number() - 1) %% 24) %>%
  group_by(SEQN) %>%
  filter(n_distinct(PAXDAYH) == length(unique(PAXDAYH)))

# Calculate hourly PA for each SEQN
pa_11_14_hourly <- pa_11_14_raw %>%
  group_by(SEQN, hour) %>%
  summarise(
    PAXMTSH = mean(PAXMTSH, na.rm = TRUE),
    .groups = 'drop'
  ) %>%
  pivot_wider(
    names_from = hour,
    names_prefix = "pa_",
    values_from = PAXMTSH
  )

pa_11_14 <- pa_11_14_raw %>%
  summarise(
    mean_pa_day = mean(PAXMTSH[between(hour, 7, 19)], na.rm = TRUE),
    mean_pa_night = mean(PAXMTSH[!between(hour, 7, 19)], na.rm = TRUE),
    mean_pa = mean(PAXMTSH, na.rm = TRUE)
  )

# Join hourly data with summary data
pa_11_14 <- pa_11_14 %>%
  left_join(pa_11_14_hourly, by = "SEQN")

write.csv(pa_11_14, file = "data/pa_2011_2014.csv", row.names = FALSE)


# Combine physical activity data
pa_03_06 = read.csv("data/pa_2003_2006.csv", stringsAsFactors = FALSE)
pa_11_14 = read.csv("data/pa_2011_2014.csv", stringsAsFactors = FALSE)

# Note: categorize_pa() function is sourced from functions.r
# It categorizes physical activity into quartiles: Low, Medium, High, Very High

# Process data for both periods
pa_03_06 <- pa_03_06 %>%
  mutate(
    physical_activity = categorize_pa(., "mean_pa"),
    physical_activity_day = categorize_pa(., "mean_pa_day"),
    physical_activity_night = categorize_pa(., "mean_pa_night")
  )

pa_11_14 <- pa_11_14 %>%
  mutate(
    physical_activity = categorize_pa(., "mean_pa"),
    physical_activity_day = categorize_pa(., "mean_pa_day"),
    physical_activity_night = categorize_pa(., "mean_pa_night")
  )

# Combine the processed data
pa <- bind_rows(pa_03_06, pa_11_14)

write.csv(pa, file = "data/pa_combined.csv", row.names = FALSE)

Combine Data

This section merges demographic and physical activity data, creating a comprehensive dataset for analysis.

df <- read.csv("data/data_demo.csv", stringsAsFactors = FALSE)
pa = read.csv("data/pa_combined.csv", stringsAsFactors = FALSE)

df <- left_join(df, pa, by = 'SEQN')

Diagnosis

This section defines and calculates diagnostic criteria for key health conditions in adolescents. Diabetes is diagnosed using fasting glucose ≥126 mg/dL, HbA1c ≥6.5%, or self-reported diagnosis. Obesity is determined using age- and sex-specific BMI thresholds and waist circumference criteria. Metabolic syndrome is defined using age-specific criteria for blood pressure, triglycerides, HDL cholesterol, and TG/HDL ratio. MAFLD (Metabolic dysfunction-Associated Fatty Liver Disease) is diagnosed using the Hepatic Steatosis Index (HSI ≥36) plus presence of diabetes, obesity, or metabolic syndrome.

# Diabetes diagnosis (>126mg/dl)
df$Diabetes <- ifelse(
  is.na(df$diabetes_questionnaire) & is.na(df$Glucose) & is.na(df$HbA1c), NA,
  ifelse(df$diabetes_questionnaire == 1, 1,
    ifelse(df$diabetes_questionnaire == 2, 0,
      ifelse(df$Glucose >= 126, 1,
        ifelse(df$HbA1c >= 6.5, 1, 0)
      )
    )
  )
)

# HSI calculation and assessment
df <- df %>%
  mutate(
    Sex = ifelse(Sex == 2, 1, ifelse(Sex == 1, 0, Sex)),
    HSI = 8 * (ALT / AST) + BMI + 2 * Diabetes + 2 * Sex,
    HSI_assessment = ifelse(HSI >= 36, 1, 0)
  )

# Obesity diagnosis
bmi_medians <- data.frame(
  Age = 6:19,
  Boy_SD = c(16.9,17.2,17.6,18.1,18.8,19.5,20.3,21.2,22.2,23.0,23.8,24.5,25.1,25.4),
  Girl_SD = c(17.1,17.5,18.0,18.6,19.4,20.2,21.2,22.2,23.1,23.8,24.3,24.6,24.9,25.0)
)

thresholds <- data.frame(
  Age = rep(6:19, each = 2),
  Sex = rep(c(1, 0), times = 14),
  waist_threshold = c(61.2,63.2,60.2,63.6,62.5,66.8,65.1,70.0,67.8,73.1,70.4,75.6,72.6,77.4,
                     74.0,78.6,74.9,79.6,75.5,80.5,75.8,81.3,76.0,82.1,76.1,83.0,76.1,83.0)
)

df <- df %>%
  left_join(bmi_medians, by = "Age") %>%
  left_join(thresholds, by = c("Age", "Sex")) %>%
  mutate(
    obesity1 = ifelse(Sex == 0 & BMI > Boy_SD | Sex == 1 & BMI > Girl_SD, "yes", "no"),
    obesity2 = ifelse(Waist_circumference >= waist_threshold, "yes", "no"),
    Obesity = ifelse(obesity1 == "yes" | obesity2 == "yes", 1, 0)
  ) %>%
  select(-Boy_SD, -Girl_SD, -waist_threshold, -obesity1, -obesity2)

# Metabolic syndrome diagnosis
df <- df %>%
  mutate(
    condition_count = rowSums(
      cbind(
        TG > 100,           # Triglycerides > 100mg/dL
        HDL < 38.67,        # HDL < 38.67
        SBP > 120,           # Systolic BP > 120 mmHg
        DBP > 80,            # Diastolic BP > 80 mmHg
        (TG / HDL) > 2.25 # TG/HDL ratio > 2.25
      ),
      na.rm = TRUE
    ),
    condition_count_10_19 = rowSums(
      cbind(
        SBP > 130,           # Systolic BP > 130 mmHg
        DBP > 85,            # Diastolic BP > 85 mmHg
        TG > 150,           # Triglycerides > 150 mg/dL
        HDL < 40,           # HDL < 40 mg/dL
        (TG / HDL) > 2.25 # TG/HDL ratio > 2.25
      ),
      na.rm = TRUE
    ),
    metabolic_syndrome = ifelse(
      (Age >= 2 & Age <= 9 & condition_count >= 2) | 
      (Age >= 10 & Age <= 19 & condition_count_10_19 >= 2),
      1, 0
    )
  ) %>%
  select(-condition_count, -condition_count_10_19)

# MAFLD diagnosis
df$MAFLD <- ifelse(df$HSI_assessment == 1 & (df$Diabetes == 1 | df$Obesity == 1 | df$metabolic_syndrome == 1), 1, 0)

# Save dataset
write.csv(df, file = "data/data_withDiagnosis.csv", row.names = FALSE)

Data Cleaning

This section filters the data to include only adolescents aged 12-19 years, recodes categorical variables with meaningful labels, and removes incomplete cases. The cleaned dataset combines both time periods (2003-2006 and 2011-2014) for analysis.

df = read.csv("data/data_withDiagnosis.csv", stringsAsFactors = FALSE) 

# First convert Age to numeric for filtering
df = df %>%
  mutate(Age = as.numeric(Age))

# Filter age first
df = df %>%
  filter(Age >= 12 & Age <= 19)

# Then do the rest of the transformations
df = df %>%
  mutate(
    year = ifelse(year %in% c("2003-2004","2005-2006"), "2003-2006", "2011-2014"),
    Sex = recode(Sex, '0' = "Male", '1' = "Female"),
    Race = case_when(
      Race == 1 ~ 'Mexican',
      Race == 3 ~ 'White',
      Race == 4 ~ 'Black',
      TRUE ~ 'Other'
    ) %>% factor(levels = c('Mexican', 'White', 'Black', 'Other')),
    Family_income = case_when(
      PIR <= 1.3 ~ "Low",
      PIR > 1.3 & PIR <= 3.5 ~ "Medium",
      TRUE ~ "High"
    ) %>% factor(levels = c('Low', 'Medium', 'High')),
    Diabetes = recode(Diabetes, '1' = "yes", '0' = "no") %>% factor(levels = c("no", "yes")),
    Obesity = recode(Obesity, '1' = "yes", '0' = "no") %>% factor(levels = c("no", "yes")),
    Sex = factor(Sex, levels = c('Male', 'Female')),
    physical_activity = factor(physical_activity, levels = c("Low", "Medium", "High", "Very High"), ordered = TRUE),
    physical_activity_day = factor(physical_activity_day, levels = c("Low", "Medium", "High", "Very High"), ordered = TRUE),
    physical_activity_night = factor(physical_activity_night, levels = c("Low", "Medium", "High", "Very High"), ordered = TRUE)
  )

# Check final age distribution
print("Age distribution after filtering:")
table(df$Age, df$year)

# Filter complete cases
df = df %>% filter(complete.cases(MAFLD, physical_activity))

# Save dataset
save(df, file = "data/data_clean.RData")

Baseline Characteristics

This section generates descriptive statistics for the study population, comparing participants with and without MAFLD. Tables are created separately for each time period (2003-2006 and 2011-2014) using the gtsummary package, with statistical tests comparing groups (t-tests for continuous variables, chi-square tests for categorical variables).

load("data/data_clean.RData")

for (range in c("2003-2006", "2011-2014")) {
  data <- df %>% filter(year == range)
  
  tbl <- data %>%
    select(
      MAFLD, Sex, Age, Race, Family_income,
      BMI, ALT, AST, SBP, DBP,
      Diabetes, Obesity, physical_activity
    ) %>%
    tbl_summary(
      by = MAFLD,
      type = list(
        c(Age, BMI, ALT, AST, SBP, DBP) ~ "continuous"
      ),
      statistic = list(
        all_continuous() ~ "{mean} ({sd})",
        all_categorical() ~ "{n} ({p}%)"
      ),
      missing = "no"
    ) %>%
    add_p() %>%
    add_overall()
  
  # Convert to data.frame and write to CSV
  tbl_df <- as_tibble(as.data.frame(tbl))
  write.csv(tbl_df, file = paste0("result/baseline_", range, ".csv"), row.names = FALSE)
}

Regression Analysis

This section performs regression analysis to examine the association between physical activity and MAFLD. The analysis uses survey-weighted logistic regression with three models: unadjusted (Model 1), age and sex adjusted (Model 2), and fully adjusted for age, sex, race, and income (Model 3). All regression functions (build_formula(), calculate_ci(), format_results(), and perform_regression()) are sourced from functions.r.

# ===============================================
# Regression Analysis
# ===============================================
# Note: All regression functions are sourced from functions.r:
# - build_formula(): Constructs regression formulas
# - calculate_ci(): Calculates confidence intervals using Wald method
# - format_results(): Formats regression results with ORs and CIs
# - perform_regression(): Main function that runs three models

load("data/data_clean.RData")

# Run analysis for each year period
results_list <- lapply(c("2003-2006", "2011-2014"), function(period) {
  # Filter data for the period
  df_period <- df %>% filter(year == period)
  
  # Create survey design for the period
  nhs <- svydesign(data=df_period, ids=~SDMVPSU, strata = ~SDMVSTRA,
                   weights=~WTINT2YR, nest = TRUE)
  
  # Run analysis for all activity variables
  results <- do.call(rbind, lapply(c("physical_activity", "physical_activity_day", "physical_activity_night"), 
                                  function(var) {
                                    res <- perform_regression(var, nhs)
                                    res$Activity <- rep(c("All", "Day", "Night")[match(var, c("physical_activity", "physical_activity_day", "physical_activity_night"))], 
                                                      nrow(res))
                                    res$Year <- period
                                    res
                                  }))
})

# Combine results from both periods
results <- do.call(rbind, results_list)

# Save results
write.csv(results, file = "result/regression.csv", row.names = FALSE)

# ===============================================
# Regression Analysis for subgroup
# ===============================================
# Note: perform_subgroup_regression() function is sourced from functions.r
# Some subgroups use regular logistic regression instead of survey-weighted analysis
# due to insufficient PSUs (Primary Sampling Units) within strata after stratification.
# The function automatically falls back to regular logistic regression when needed.

# age group
df = df %>%
  mutate(
    Age_group = cut(Age, 
                   breaks = c(12, 15, 17, 19),
                   labels = c("12-14", "15-16", "17-19"),
                   include.lowest = TRUE)
  )

# Run subgroup analysis for each year period
subgroup_results_list <- lapply(c("2003-2006", "2011-2014"), function(period) {
  # Filter data for the period
  df_period <- df %>% filter(year == period)
  
  # Run analysis for each subgroup variable
  subgroup_vars <- c("Age_group", "Sex", "Race", "PIR")
  results <- lapply(subgroup_vars, function(subgroup_var) {
    # Get unique values of subgroup variable
    if (subgroup_var == "Age_group") {
      subgroups <- levels(df_period$Age_group)
    } else {
      subgroups <- levels(df_period[[subgroup_var]])
    }
    
    # Run analysis for each subgroup
    subgroup_results <- lapply(subgroups, function(subgroup) {
      # Filter data for the subgroup
      if (subgroup_var == "Age_group") {
        subgroup_data <- df_period[df_period$Age_group == subgroup, ]
      } else {
        subgroup_data <- df_period[df_period[[subgroup_var]] == subgroup, ]
      }
      
      # Skip if subgroup is too small
      if (nrow(subgroup_data) < 20) {
        print(paste("Skipping subgroup", subgroup, "due to small sample size"))
        return(NULL)
      }
      
      # Run analysis for all activity variables
      tryCatch({
        activity_results <- do.call(rbind, lapply(c("physical_activity", "physical_activity_day", "physical_activity_night"), 
                                                function(var) {
                                                  res <- perform_subgroup_regression(var, subgroup_var, subgroup_data)
                                                  res$Activity <- rep(c("All", "Day", "Night")[match(var, c("physical_activity", "physical_activity_day", "physical_activity_night"))], 
                                                                    nrow(res))
                                                  res$Subgroup <- as.character(subgroup)
                                                  res$Subgroup_Var <- subgroup_var
                                                  res
                                                }))
      }, error = function(e) {
        print(paste("Error in subgroup", subgroup, ":", e$message))
        return(NULL)
      })
    })
    
    # Remove NULL results
    subgroup_results <- subgroup_results[!sapply(subgroup_results, is.null)]
    if (length(subgroup_results) == 0) return(NULL)
    do.call(rbind, subgroup_results)
  })
  
  # Remove NULL results
  results <- results[!sapply(results, is.null)]
  if (length(results) == 0) return(NULL)
  results_df <- do.call(rbind, results)
  results_df$Year <- period
  results_df
})

# Remove NULL results and combine
subgroup_results_list <- subgroup_results_list[!sapply(subgroup_results_list, is.null)]
subgroup_results <- do.call(rbind, subgroup_results_list)

# Save results
write.csv(subgroup_results, file = "result/regression_subgroup.csv", row.names = FALSE)

Visualization

This section creates visualizations to explore the relationship between physical activity and MAFLD. The plotting functions (create_violin_plot() and my_theme) are sourced from functions.r.

# ===============================================
# Plot: Hourly physical activity patterns
# ===============================================
# For visualization purposes, values above the 99th percentile were excluded.

# Reshape data for plotting
df_long <- df %>%
  select(SEQN, year, MAFLD, starts_with("pa_")) %>%
  pivot_longer(
    cols = starts_with("pa_"),
    names_to = "hour",
    values_to = "mean_physical_activity"
  ) %>%
  mutate(hour = as.numeric(gsub("pa_", "", hour)))

# Calculate 99th percentile for each period and MAFLD status
quantiles <- df_long %>%
  group_by(year, MAFLD) %>%
  summarise(q99 = quantile(mean_physical_activity, 0.99, na.rm = TRUE), .groups = "drop")


# Create violin plots for both periods
violin_plots <- list(
  # 2003-2006 (no x-label, no legend)
  "2003_2006" = create_violin_plot(
    filter(df_long, year == "2003-2006",
           mean_physical_activity <= filter(quantiles, year == "2003-2006", MAFLD == ifelse(MAFLD == 1, 1, 0))$q99),
    "2003-2006", show_x_label = FALSE, show_legend = FALSE),
  
  # 2011-2014 (show x-label and legend)
  "2011_2014" = create_violin_plot(
    filter(df_long, year == "2011-2014",
           mean_physical_activity <= filter(quantiles, year == "2011-2014", MAFLD == ifelse(MAFLD == 1, 1, 0))$q99),
    "2011-2014", show_x_label = TRUE, show_legend = TRUE)
)

# Combine violin plots
violin_combined <- grid.arrange(grobs = violin_plots, ncol = 1, 
                               padding = unit(0.5, "lines"))

# ===============================================
# Plot: MAFLD proportion by physical activity level
# ===============================================
# Note: my_theme is sourced from functions.r

prop_data <- df %>%
  pivot_longer(
    cols = c(physical_activity, physical_activity_day, physical_activity_night),
    names_to = "activity_type",
    values_to = "activity_level"
  ) %>%
  mutate(
    time_period = case_when(
      activity_type == "physical_activity" ~ "Overall",
      activity_type == "physical_activity_day" ~ "Daytime",
      activity_type == "physical_activity_night" ~ "Nighttime"
    ),
    time_period = factor(time_period, levels = c("Overall", "Daytime", "Nighttime")),
    activity_level = factor(activity_level, levels = c("Q1", "Q2", "Q3", "Q4"))
  ) %>%
  group_by(year, time_period, activity_level) %>%
  summarise(
    total = n(),
    mafld = sum(MAFLD == 1),
    non_mafld = total - mafld,
    .groups = "drop"
  ) %>%
  pivot_longer(
    cols = c(mafld, non_mafld),
    names_to = "status",
    values_to = "count"
  ) %>%
  group_by(year, time_period, activity_level) %>%
  mutate(
    proportion = count / sum(count),
    status = factor(status, levels = c("non_mafld", "mafld"),
                    labels = c("Non-MAFLD", "MAFLD"))
  )

# Create proportion plot
proportion_plot <- ggplot(prop_data, aes(x = activity_level, y = proportion, fill = status)) +
  geom_bar(stat = "identity", width = 0.8) +
  geom_text(aes(label = scales::percent(proportion, accuracy = 1)),
            position = position_stack(vjust = 0.5),
            size = 5, color = "white") +
  facet_grid(time_period ~ year) +
  scale_fill_manual(values = c("Non-MAFLD" = "#4A9B7E", "MAFLD" = "#D2603A")) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1),
                     breaks = seq(0, 1, 0.25)) +
  labs(x = "  Activity counts                 MIMS-units",
       y = "Proportion",
       fill = "Status") +
  my_theme +
  theme(plot.title = element_text(size = 18, hjust = 0.5, family = "Helvetica"))

# ===============================================
# Combine both plots into one plot
# ===============================================
# Combine all plots
combined_comprehensive_plot <- grid.arrange(
  violin_combined,
  proportion_plot,
  ncol = 2,
  widths = c(3, 2),
  padding = unit(1, "lines")
)

# Save the combined plot
ggsave("plot/combined_pa_mafld_analysis.png", combined_comprehensive_plot,
       width = 16, height = 6, dpi = 600, bg = "white")

Comments and feedbacks

Find me via [email protected]