MAFLD and Physical Activity

Brief
Sample selection flow
Preparation
Demographic Data
Physical Activity Data
Combine Data
Diagnosis
Data Cleaning
Baseline Characteristics
Regression Analysis
Visualization

Brief

This analysis examines the relationship between physical activity patterns and metabolic dysfunction-associated fatty liver disease (MAFLD) in adolescents aged 12-19 years using NHANES data from 2003-2006 and 2011-2014.

We use accelerometer data to objectively measure physical activity across different times of day (overall, daytime 7:00-19:00, and nighttime). MAFLD is diagnosed using the Hepatic Steatosis Index combined with metabolic risk factors. We employ survey-weighted logistic regression to account for the complex sampling design of NHANES.

Sample selection flow

Preparation

This section sets up the required R packages and creates necessary directories for data storage.

# Required R packages
library(foreign)
library(dplyr)
library(survey)
library(haven)
library(tidyr)
library(gtsummary)
library(ggplot2)
library(gridExtra)
options(timeout = 1000)

# Source custom functions
# You can either source directly from URL or download the file first
# Option 1: Source from URL
source("https://zhanghaoyang.net/project/mafld_pa/functions.r")

# Option 2: Download and source locally
# source("functions.r")

# Create folder if it doesn't exist
dir.create("data/raw", showWarnings = FALSE, recursive = TRUE)
dir.create("result", showWarnings = FALSE, recursive = TRUE)
dir.create("plot", showWarnings = FALSE, recursive = TRUE)

Demographic Data

This section collects and processes demographic data from NHANES, including age, sex, race, education, and family income.

# NHANES cycle metadata
cycles <- data.frame(
  code = c("C","D","E","F","G","H","I"),
  year = c(2003,2005,2007,2009,2011,2013,2015),
  label = c("2003-2004","2005-2006","2007-2008","2009-2010",
            "2011-2012","2013-2014","2015-2016"),
  stringsAsFactors = FALSE
)

# Note: load_xpt() and get_nhanes_data() functions are sourced from functions.r
# They handle downloading/loading NHANES XPT files and extracting variables

# DEMO (C, D, G, H) with year labels and renaming
DEMO <- bind_rows(lapply(c("C","D","G","H"), function(code) {
  row <- cycles[cycles$code == code, ]
  df <- load_xpt(row$year, paste0("DEMO_", code))
  df <- df[, c("SEQN","RIAGENDR","RIDAGEYR","RIDRETH1","DMDEDUC2",
               "INDFMPIR","WTINT2YR","WTMEC2YR","SDMVPSU","SDMVSTRA")]
  df$year <- row$label
  df
})) %>%
  rename(Sex = RIAGENDR,
         Age = RIDAGEYR,
         Race = RIDRETH1,
         Education = DMDEDUC2,
         PIR = INDFMPIR)

# Other datasets
BIOPRO <- get_nhanes_data(c("L40_C","BIOPRO_D","BIOPRO_E","BIOPRO_F","BIOPRO_G","BIOPRO_H","BIOPRO_I"), c("SEQN","LBXSATSI","LBXSASSI")) %>%
  rename(ALT = LBXSATSI,
         AST = LBXSASSI)

BMI <- get_nhanes_data(c("BMX_C","BMX_D","BMX_E","BMX_F","BMX_G","BMX_H","BMX_I"), c("SEQN","BMXBMI","BMXWAIST")) %>%
  rename(BMI = BMXBMI,
         Waist_circumference = BMXWAIST)

BPX_raw <- get_nhanes_data(c("BPX_C","BPX_D","BPX_E","BPX_F","BPX_G","BPX_H","BPX_I"), c("SEQN","BPXSY1","BPXSY2","BPXSY3","BPXDI1","BPXDI2","BPXDI3"))

BPX <- BPX_raw %>%
  mutate(
    SBP = rowMeans(select(., BPXSY1, BPXSY2, BPXSY3), na.rm = TRUE),
    DBP = rowMeans(select(., BPXDI1, BPXDI2, BPXDI3), na.rm = TRUE)
  ) %>%
  select(SEQN, SBP, DBP)

TRIGLY <- get_nhanes_data(c("L13AM_C","TRIGLY_D","TRIGLY_E","TRIGLY_F","TRIGLY_G","TRIGLY_H","TRIGLY_I"), c("SEQN","LBXTR")) %>%
  rename(TG = LBXTR)

HDL_raw <- get_nhanes_data(c("l13_c","HDL_D","HDL_E","HDL_F","HDL_G","HDL_H","HDL_I"), c("SEQN","LBXHDD","LBDHDD"))
HDL <- HDL_raw %>% 
  mutate(HDL = coalesce(LBXHDD, LBDHDD)) %>% 
  select(SEQN, HDL)

GLU <- get_nhanes_data(c("L10AM_C","GLU_D","GLU_E","GLU_F","GLU_G","GLU_H","GLU_I"), c("SEQN","LBXGLU")) %>%
  rename(Glucose = LBXGLU)

GHB <- get_nhanes_data(c("L10_C","GHB_D","GHB_E","GHB_F","GHB_G","GHB_H","GHB_I"), c("SEQN","LBXGH")) %>%
  rename(HbA1c = LBXGH)

DIQ <- get_nhanes_data(c("DIQ_C","DIQ_D","DIQ_E","DIQ_F","DIQ_G","DIQ_H","DIQ_I"), c("SEQN","DIQ010")) %>%
  rename(diabetes_questionnaire = DIQ010)

# Final merged dataset
df <- DEMO %>% 
  left_join(BIOPRO, by = "SEQN") %>% 
  left_join(BMI, by = "SEQN") %>% 
  left_join(BPX, by = "SEQN") %>% 
  left_join(TRIGLY, by = "SEQN") %>% 
  left_join(HDL, by = "SEQN") %>% 
  left_join(GLU, by = "SEQN") %>% 
  left_join(GHB, by = "SEQN") %>% 
  left_join(DIQ, by = "SEQN")

write.csv(df, file = "data/data_demo.csv", row.names = FALSE)

Physical Activity Data

This section processes physical activity data from NHANES accelerometer data for two time periods (2003-2006 and 2011-2014). For each participant, it calculates average physical activity intensity for each hour of the day, then summarizes activity separately for daytime (7:00-19:00) and nighttime periods.

Note: The physical activity XPT files need to be manually downloaded from the NHANES website and placed in the data/raw/ directory before running this code. You can download directly from these links: PAXRAW_C.zip, PAXRAW_D.zip, PAXHR_G.xpt, PAXHR_H.xpt. After downloading, unzip, place all XPT files in data/raw/.

# 2003-2006, PA in minutes
pa_03_06_raw <- bind_rows(
  read_xpt("data/raw/paxraw_c.xpt"),
  read_xpt("data/raw/paxraw_d.xpt")
) %>%
  select(SEQN, PAXINTEN, PAXHOUR, PAXN) %>%
  filter(PAXN <= 10080) %>%
  mutate(Day = paste0("day", ceiling(PAXN / 1440)))

# Sum PA in minutes to hours
pa_03_06_raw <- pa_03_06_raw %>%
  group_by(SEQN, PAXHOUR, Day) %>%
  summarise(PAXINTEN = sum(PAXINTEN, na.rm = TRUE), .groups = 'drop')

# Keep SEQNs that have all 7 days with exactly 24 hourly records each
valid_seqn <- pa_03_06_raw %>%
  count(SEQN, Day) %>%
  filter(n == 24) %>%
  distinct(SEQN) %>%
  pull(SEQN)

# Filter valid participants
pa_03_06_raw <- pa_03_06_raw %>%
  filter(SEQN %in% valid_seqn)

# Calculate hourly PA for each SEQN
pa_03_06_hourly <- pa_03_06_raw %>%
  group_by(SEQN, PAXHOUR) %>%
  summarise(
    PAXINTEN = mean(PAXINTEN, na.rm = TRUE),
    .groups = 'drop'
  ) %>%
  pivot_wider(
    names_from = PAXHOUR,
    names_prefix = "pa_",
    values_from = PAXINTEN
  )

# Filter valid participants
pa_03_06 <- pa_03_06_raw %>%
  group_by(SEQN) %>%
  summarise(
    mean_pa_day = mean(PAXINTEN[between(PAXHOUR, 7, 19)], na.rm = TRUE),
    mean_pa_night = mean(PAXINTEN[!between(PAXHOUR, 7, 19)], na.rm = TRUE),
    mean_pa = mean(PAXINTEN, na.rm = TRUE)
  )

# Join hourly data with summary data
pa_03_06 <- pa_03_06 %>%
  left_join(pa_03_06_hourly, by = "SEQN")

write.csv(pa_03_06, file = "data/pa_2003_2006.csv", row.names = FALSE)

# 2011-2014, PA hourly
pa_11_14_raw <- bind_rows(
  read_xpt("data/raw/PAXHR_G.xpt") %>% select(SEQN, PAXDAYH, PAXTMH, PAXMTSH),
  read_xpt("data/raw/PAXHR_H.xpt") %>% select(SEQN, PAXDAYH, PAXTMH, PAXMTSH)
) %>% 
  filter(PAXTMH == 60, PAXDAYH >= 2, PAXDAYH <= 8) %>%
  group_by(SEQN, PAXDAYH) %>%
  filter(n() == 24) %>%
  mutate(hour = (row_number() - 1) %% 24) %>%
  group_by(SEQN) %>%
  filter(n_distinct(PAXDAYH) == length(unique(PAXDAYH)))

# Calculate hourly PA for each SEQN
pa_11_14_hourly <- pa_11_14_raw %>%
  group_by(SEQN, hour) %>%
  summarise(
    PAXMTSH = mean(PAXMTSH, na.rm = TRUE),
    .groups = 'drop'
  ) %>%
  pivot_wider(
    names_from = hour,
    names_prefix = "pa_",
    values_from = PAXMTSH
  )

pa_11_14 <- pa_11_14_raw %>%
  summarise(
    mean_pa_day = mean(PAXMTSH[between(hour, 7, 19)], na.rm = TRUE),
    mean_pa_night = mean(PAXMTSH[!between(hour, 7, 19)], na.rm = TRUE),
    mean_pa = mean(PAXMTSH, na.rm = TRUE)
  )

# Join hourly data with summary data
pa_11_14 <- pa_11_14 %>%
  left_join(pa_11_14_hourly, by = "SEQN")

write.csv(pa_11_14, file = "data/pa_2011_2014.csv", row.names = FALSE)


# Combine physical activity data
pa_03_06 = read.csv("data/pa_2003_2006.csv", stringsAsFactors = FALSE)
pa_11_14 = read.csv("data/pa_2011_2014.csv", stringsAsFactors = FALSE)

# Note: categorize_pa() function is sourced from functions.r
# It categorizes physical activity into quartiles: Low, Medium, High, Very High

# Process data for both periods
pa_03_06 <- pa_03_06 %>%
  mutate(
    physical_activity = categorize_pa(., "mean_pa"),
    physical_activity_day = categorize_pa(., "mean_pa_day"),
    physical_activity_night = categorize_pa(., "mean_pa_night")
  )

pa_11_14 <- pa_11_14 %>%
  mutate(
    physical_activity = categorize_pa(., "mean_pa"),
    physical_activity_day = categorize_pa(., "mean_pa_day"),
    physical_activity_night = categorize_pa(., "mean_pa_night")
  )

# Combine the processed data
pa <- bind_rows(pa_03_06, pa_11_14)

write.csv(pa, file = "data/pa_combined.csv", row.names = FALSE)

Combine Data

This section merges demographic and physical activity data, creating a comprehensive dataset for analysis.

df <- read.csv("data/data_demo.csv", stringsAsFactors = FALSE)
pa = read.csv("data/pa_combined.csv", stringsAsFactors = FALSE)

df <- left_join(df, pa, by = 'SEQN')

Diagnosis

This section defines and calculates diagnostic criteria for key health conditions in adolescents. Diabetes is diagnosed using fasting glucose ≥126 mg/dL, HbA1c ≥6.5%, or self-reported diagnosis. Obesity is determined using age- and sex-specific BMI thresholds and waist circumference criteria. Metabolic syndrome is defined using age-specific criteria for blood pressure, triglycerides, HDL cholesterol, and TG/HDL ratio. MAFLD (Metabolic dysfunction-Associated Fatty Liver Disease) is diagnosed using the Hepatic Steatosis Index (HSI ≥36) plus presence of diabetes, obesity, or metabolic syndrome.

# Diabetes diagnosis (>126mg/dl)
df$Diabetes <- ifelse(
  is.na(df$diabetes_questionnaire) & is.na(df$Glucose) & is.na(df$HbA1c), NA,
  ifelse(df$diabetes_questionnaire == 1, 1,
    ifelse(df$diabetes_questionnaire == 2, 0,
      ifelse(df$Glucose >= 126, 1,
        ifelse(df$HbA1c >= 6.5, 1, 0)
      )
    )
  )
)

# HSI calculation and assessment
df <- df %>%
  mutate(
    Sex = ifelse(Sex == 2, 1, ifelse(Sex == 1, 0, Sex)),
    HSI = 8 * (ALT / AST) + BMI + 2 * Diabetes + 2 * Sex,
    HSI_assessment = ifelse(HSI >= 36, 1, 0)
  )

# Obesity diagnosis
bmi_medians <- data.frame(
  Age = 6:19,
  Boy_SD = c(16.9,17.2,17.6,18.1,18.8,19.5,20.3,21.2,22.2,23.0,23.8,24.5,25.1,25.4),
  Girl_SD = c(17.1,17.5,18.0,18.6,19.4,20.2,21.2,22.2,23.1,23.8,24.3,24.6,24.9,25.0)
)

thresholds <- data.frame(
  Age = rep(6:19, each = 2),
  Sex = rep(c(1, 0), times = 14),
  waist_threshold = c(61.2,63.2,60.2,63.6,62.5,66.8,65.1,70.0,67.8,73.1,70.4,75.6,72.6,77.4,
                     74.0,78.6,74.9,79.6,75.5,80.5,75.8,81.3,76.0,82.1,76.1,83.0,76.1,83.0)
)

df <- df %>%
  left_join(bmi_medians, by = "Age") %>%
  left_join(thresholds, by = c("Age", "Sex")) %>%
  mutate(
    obesity1 = ifelse(Sex == 0 & BMI > Boy_SD | Sex == 1 & BMI > Girl_SD, "yes", "no"),
    obesity2 = ifelse(Waist_circumference >= waist_threshold, "yes", "no"),
    Obesity = ifelse(obesity1 == "yes" | obesity2 == "yes", 1, 0)
  ) %>%
  select(-Boy_SD, -Girl_SD, -waist_threshold, -obesity1, -obesity2)

# Metabolic syndrome diagnosis
df <- df %>%
  mutate(
    condition_count = rowSums(
      cbind(
        TG > 100,           # Triglycerides > 100mg/dL
        HDL < 38.67,        # HDL < 38.67
        SBP > 120,           # Systolic BP > 120 mmHg
        DBP > 80,            # Diastolic BP > 80 mmHg
        (TG / HDL) > 2.25 # TG/HDL ratio > 2.25
      ),
      na.rm = TRUE
    ),
    condition_count_10_19 = rowSums(
      cbind(
        SBP > 130,           # Systolic BP > 130 mmHg
        DBP > 85,            # Diastolic BP > 85 mmHg
        TG > 150,           # Triglycerides > 150 mg/dL
        HDL < 40,           # HDL < 40 mg/dL
        (TG / HDL) > 2.25 # TG/HDL ratio > 2.25
      ),
      na.rm = TRUE
    ),
    metabolic_syndrome = ifelse(
      (Age >= 2 & Age <= 9 & condition_count >= 2) | 
      (Age >= 10 & Age <= 19 & condition_count_10_19 >= 2),
      1, 0
    )
  ) %>%
  select(-condition_count, -condition_count_10_19)

# MAFLD diagnosis
df$MAFLD <- ifelse(df$HSI_assessment == 1 & (df$Diabetes == 1 | df$Obesity == 1 | df$metabolic_syndrome == 1), 1, 0)

# Save dataset
write.csv(df, file = "data/data_withDiagnosis.csv", row.names = FALSE)

Data Cleaning

This section filters the data to include only adolescents aged 12-19 years, recodes categorical variables with meaningful labels, and removes incomplete cases. The cleaned dataset combines both time periods (2003-2006 and 2011-2014) for analysis.

df = read.csv("data/data_withDiagnosis.csv", stringsAsFactors = FALSE) 

# First convert Age to numeric for filtering
df = df %>%
  mutate(Age = as.numeric(Age))

# Filter age first
df = df %>%
  filter(Age >= 12 & Age <= 19)

# Then do the rest of the transformations
df = df %>%
  mutate(
    year = ifelse(year %in% c("2003-2004","2005-2006"), "2003-2006", "2011-2014"),
    Sex = recode(Sex, '0' = "Male", '1' = "Female"),
    Race = case_when(
      Race == 1 ~ 'Mexican',
      Race == 3 ~ 'White',
      Race == 4 ~ 'Black',
      TRUE ~ 'Other'
    ) %>% factor(levels = c('Mexican', 'White', 'Black', 'Other')),
    Family_income = case_when(
      PIR <= 1.3 ~ "Low",
      PIR > 1.3 & PIR <= 3.5 ~ "Medium",
      TRUE ~ "High"
    ) %>% factor(levels = c('Low', 'Medium', 'High')),
    Diabetes = recode(Diabetes, '1' = "yes", '0' = "no") %>% factor(levels = c("no", "yes")),
    Obesity = recode(Obesity, '1' = "yes", '0' = "no") %>% factor(levels = c("no", "yes")),
    Sex = factor(Sex, levels = c('Male', 'Female')),
    physical_activity = factor(physical_activity, levels = c("Low", "Medium", "High", "Very High"), ordered = TRUE),
    physical_activity_day = factor(physical_activity_day, levels = c("Low", "Medium", "High", "Very High"), ordered = TRUE),
    physical_activity_night = factor(physical_activity_night, levels = c("Low", "Medium", "High", "Very High"), ordered = TRUE)
  )

# Check final age distribution
print("Age distribution after filtering:")
table(df$Age, df$year)

# Filter complete cases
df = df %>% filter(complete.cases(MAFLD, physical_activity))

# Save dataset
save(df, file = "data/data_clean.RData")

Baseline Characteristics

This section generates descriptive statistics for the study population, comparing participants with and without MAFLD. Tables are created separately for each time period (2003-2006 and 2011-2014) using the gtsummary package, with statistical tests comparing groups (t-tests for continuous variables, chi-square tests for categorical variables).

load("data/data_clean.RData")

for (range in c("2003-2006", "2011-2014")) {
  data <- df %>% filter(year == range)
  
  tbl <- data %>%
    select(
      MAFLD, Sex, Age, Race, Family_income,
      BMI, ALT, AST, SBP, DBP,
      Diabetes, Obesity, physical_activity
    ) %>%
    tbl_summary(
      by = MAFLD,
      type = list(
        c(Age, BMI, ALT, AST, SBP, DBP) ~ "continuous"
      ),
      statistic = list(
        all_continuous() ~ "{mean} ({sd})",
        all_categorical() ~ "{n} ({p}%)"
      ),
      missing = "no"
    ) %>%
    add_p() %>%
    add_overall()
  
  # Convert to data.frame and write to CSV
  tbl_df <- as_tibble(as.data.frame(tbl))
  write.csv(tbl_df, file = paste0("result/baseline_", range, ".csv"), row.names = FALSE)
}

Regression Analysis

This section performs regression analysis to examine the association between physical activity and MAFLD. The analysis uses survey-weighted logistic regression with three models: unadjusted (Model 1), age and sex adjusted (Model 2), and fully adjusted for age, sex, race, and income (Model 3). All regression functions (build_formula(), calculate_ci(), format_results(), and perform_regression()) are sourced from functions.r.

# ===============================================
# Regression Analysis
# ===============================================
# Note: All regression functions are sourced from functions.r:
# - build_formula(): Constructs regression formulas
# - calculate_ci(): Calculates confidence intervals using Wald method
# - format_results(): Formats regression results with ORs and CIs
# - perform_regression(): Main function that runs three models

load("data/data_clean.RData")

# Run analysis for each year period
results_list <- lapply(c("2003-2006", "2011-2014"), function(period) {
  # Filter data for the period
  df_period <- df %>% filter(year == period)
  
  # Create survey design for the period
  nhs <- svydesign(data=df_period, ids=~SDMVPSU, strata = ~SDMVSTRA,
                   weights=~WTINT2YR, nest = TRUE)
  
  # Run analysis for all activity variables
  results <- do.call(rbind, lapply(c("physical_activity", "physical_activity_day", "physical_activity_night"), 
                                  function(var) {
                                    res <- perform_regression(var, nhs)
                                    res$Activity <- rep(c("All", "Day", "Night")[match(var, c("physical_activity", "physical_activity_day", "physical_activity_night"))], 
                                                      nrow(res))
                                    res$Year <- period
                                    res
                                  }))
})

# Combine results from both periods
results <- do.call(rbind, results_list)

# Save results
write.csv(results, file = "result/regression.csv", row.names = FALSE)

# ===============================================
# Regression Analysis for subgroup
# ===============================================
# Note: perform_subgroup_regression() function is sourced from functions.r
# Some subgroups use regular logistic regression instead of survey-weighted analysis
# due to insufficient PSUs (Primary Sampling Units) within strata after stratification.
# The function automatically falls back to regular logistic regression when needed.

# age group
df = df %>%
  mutate(
    Age_group = cut(Age, 
                   breaks = c(12, 15, 17, 19),
                   labels = c("12-14", "15-16", "17-19"),
                   include.lowest = TRUE)
  )

# Run subgroup analysis for each year period
subgroup_results_list <- lapply(c("2003-2006", "2011-2014"), function(period) {
  # Filter data for the period
  df_period <- df %>% filter(year == period)
  
  # Run analysis for each subgroup variable
  subgroup_vars <- c("Age_group", "Sex", "Race", "PIR")
  results <- lapply(subgroup_vars, function(subgroup_var) {
    # Get unique values of subgroup variable
    if (subgroup_var == "Age_group") {
      subgroups <- levels(df_period$Age_group)
    } else {
      subgroups <- levels(df_period[[subgroup_var]])
    }
    
    # Run analysis for each subgroup
    subgroup_results <- lapply(subgroups, function(subgroup) {
      # Filter data for the subgroup
      if (subgroup_var == "Age_group") {
        subgroup_data <- df_period[df_period$Age_group == subgroup, ]
      } else {
        subgroup_data <- df_period[df_period[[subgroup_var]] == subgroup, ]
      }
      
      # Skip if subgroup is too small
      if (nrow(subgroup_data) < 20) {
        print(paste("Skipping subgroup", subgroup, "due to small sample size"))
        return(NULL)
      }
      
      # Run analysis for all activity variables
      tryCatch({
        activity_results <- do.call(rbind, lapply(c("physical_activity", "physical_activity_day", "physical_activity_night"), 
                                                function(var) {
                                                  res <- perform_subgroup_regression(var, subgroup_var, subgroup_data)
                                                  res$Activity <- rep(c("All", "Day", "Night")[match(var, c("physical_activity", "physical_activity_day", "physical_activity_night"))], 
                                                                    nrow(res))
                                                  res$Subgroup <- as.character(subgroup)
                                                  res$Subgroup_Var <- subgroup_var
                                                  res
                                                }))
      }, error = function(e) {
        print(paste("Error in subgroup", subgroup, ":", e$message))
        return(NULL)
      })
    })
    
    # Remove NULL results
    subgroup_results <- subgroup_results[!sapply(subgroup_results, is.null)]
    if (length(subgroup_results) == 0) return(NULL)
    do.call(rbind, subgroup_results)
  })
  
  # Remove NULL results
  results <- results[!sapply(results, is.null)]
  if (length(results) == 0) return(NULL)
  results_df <- do.call(rbind, results)
  results_df$Year <- period
  results_df
})

# Remove NULL results and combine
subgroup_results_list <- subgroup_results_list[!sapply(subgroup_results_list, is.null)]
subgroup_results <- do.call(rbind, subgroup_results_list)

# Save results
write.csv(subgroup_results, file = "result/regression_subgroup.csv", row.names = FALSE)

Visualization

This section creates visualizations to explore the relationship between physical activity and MAFLD. The plotting functions (create_violin_plot() and my_theme) are sourced from functions.r.

# ===============================================
# Plot: Hourly physical activity patterns
# ===============================================
# For visualization purposes, values above the 99th percentile were excluded.

# Reshape data for plotting
df_long <- df %>%
  select(SEQN, year, MAFLD, starts_with("pa_")) %>%
  pivot_longer(
    cols = starts_with("pa_"),
    names_to = "hour",
    values_to = "mean_physical_activity"
  ) %>%
  mutate(hour = as.numeric(gsub("pa_", "", hour)))

# Calculate 99th percentile for each period and MAFLD status
quantiles <- df_long %>%
  group_by(year, MAFLD) %>%
  summarise(q99 = quantile(mean_physical_activity, 0.99, na.rm = TRUE), .groups = "drop")


# Create violin plots for both periods
violin_plots <- list(
  # 2003-2006 (no x-label, no legend)
  "2003_2006" = create_violin_plot(
    filter(df_long, year == "2003-2006",
           mean_physical_activity <= filter(quantiles, year == "2003-2006", MAFLD == ifelse(MAFLD == 1, 1, 0))$q99),
    "2003-2006", show_x_label = FALSE, show_legend = FALSE),
  
  # 2011-2014 (show x-label and legend)
  "2011_2014" = create_violin_plot(
    filter(df_long, year == "2011-2014",
           mean_physical_activity <= filter(quantiles, year == "2011-2014", MAFLD == ifelse(MAFLD == 1, 1, 0))$q99),
    "2011-2014", show_x_label = TRUE, show_legend = TRUE)
)

# Combine violin plots
violin_combined <- grid.arrange(grobs = violin_plots, ncol = 1, 
                               padding = unit(0.5, "lines"))

# ===============================================
# Plot: MAFLD proportion by physical activity level
# ===============================================
# Note: my_theme is sourced from functions.r

prop_data <- df %>%
  pivot_longer(
    cols = c(physical_activity, physical_activity_day, physical_activity_night),
    names_to = "activity_type",
    values_to = "activity_level"
  ) %>%
  mutate(
    time_period = case_when(
      activity_type == "physical_activity" ~ "Overall",
      activity_type == "physical_activity_day" ~ "Daytime",
      activity_type == "physical_activity_night" ~ "Nighttime"
    ),
    time_period = factor(time_period, levels = c("Overall", "Daytime", "Nighttime")),
    activity_level = factor(activity_level, levels = c("Q1", "Q2", "Q3", "Q4"))
  ) %>%
  group_by(year, time_period, activity_level) %>%
  summarise(
    total = n(),
    mafld = sum(MAFLD == 1),
    non_mafld = total - mafld,
    .groups = "drop"
  ) %>%
  pivot_longer(
    cols = c(mafld, non_mafld),
    names_to = "status",
    values_to = "count"
  ) %>%
  group_by(year, time_period, activity_level) %>%
  mutate(
    proportion = count / sum(count),
    status = factor(status, levels = c("non_mafld", "mafld"),
                    labels = c("Non-MAFLD", "MAFLD"))
  )

# Create proportion plot
proportion_plot <- ggplot(prop_data, aes(x = activity_level, y = proportion, fill = status)) +
  geom_bar(stat = "identity", width = 0.8) +
  geom_text(aes(label = scales::percent(proportion, accuracy = 1)),
            position = position_stack(vjust = 0.5),
            size = 5, color = "white") +
  facet_grid(time_period ~ year) +
  scale_fill_manual(values = c("Non-MAFLD" = "#4A9B7E", "MAFLD" = "#D2603A")) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1),
                     breaks = seq(0, 1, 0.25)) +
  labs(x = "  Activity counts                 MIMS-units",
       y = "Proportion",
       fill = "Status") +
  my_theme +
  theme(plot.title = element_text(size = 18, hjust = 0.5, family = "Helvetica"))

# ===============================================
# Combine both plots into one plot
# ===============================================
# Combine all plots
combined_comprehensive_plot <- grid.arrange(
  violin_combined,
  proportion_plot,
  ncol = 2,
  widths = c(3, 2),
  padding = unit(1, "lines")
)

# Save the combined plot
ggsave("plot/combined_pa_mafld_analysis.png", combined_comprehensive_plot,
       width = 16, height = 6, dpi = 600, bg = "white")

Comments and feedbacks

Find me via [email protected]

Haoyang Zhang

Histogram