#----------------------------------------------------------------------------------------------------------------------------------------------------#
# Unit_Level_Joinpoint_CompSurv.R
# Created by Joe Zou (11/21/2025)
# Provided by The National Cancer Institute & Information Management Services, Inc.
# The program was developed from the project described in Liu et al. (2025).
# The program can be used to analyze trends in complex survey data using a unit-level joinpoint regression model.
# The program defines two functions:
#   1) unit_level_n_jp: finds the best joinpoint model with a given number of joinpoints (=n_jp) for complex survey data.
#   2) unit_level: calls the function 'unit_level_n_jp' and finds the best joinpoint model for complex survey data.
# Users need to read in their data, define the required parameters, and call the 'unit_level' function.
# An example of trend analysis using the BMI data from the National Health Interview Survey (NHIS) (1991-2016) is provided at the end of the program.
#----------------------------------------------------------------------------------------------------------------------------------------------------#

# ------ Definition of the function 'unit_level_n_jp' ------ #
# unit_level_n_jp: find the best joinpoint model with number of joinpoints=n_jp for complex survey data
# Input:
#   input_data      - data frame containing all variables needed
#   strata          - name of the strata variable
#   psu             - name of the psu variable
#   weight          - name of the weight variable
#   year            - name of the year variable
#   model_type      - 0: continuous outcome; otherwise, binary outcome
#   dep_var         - name of the dependent variable. For a binary outcome, the event must be coded 1
#   min_year        - start year
#   max_year        - end year
#   min_obs_to_end  - minimum number of observations from a joinpoint to either end of the data
#   min_obs_between - minimum number of observations between two joinpoints
#
# Output:
#   A list that contains model information

unit_level_n_jp<-function(n_jp,input_data,strata,psu,weight,year,model_type,dep_var,min_year,max_year,min_obs_to_end,min_obs_between)
{
  # restrict to the analysis years and rescale the year variable so that min_year=0
  input_data<-input_data[input_data[,year]>=min_year & input_data[,year]<=max_year,]
  input_data[[year]]<-input_data[[year]]-min_year
  max_year<-max_year-min_year
  min_year<-0
  nobs<-dim(input_data)[1]
  if (n_jp==0) {
    dclus<-svydesign(ids=as.formula(paste0("~",psu)), strata=as.formula(paste0("~",strata)), nest=T,
                     weights=as.formula(paste0("~",weight)), data=input_data)
    if (model_type==0) {
      # continuous outcome: simple linear trend in year
      model<-svyglm(as.formula(paste0(dep_var,"~",year)), design=dclus)
      naive_cov<-as.matrix(model$naive.cov[c(-1),c(-1)])
      covar<-as.matrix(vcov(model)[c(-1),c(-1)])
      mse<-as.data.frame(summary(model)$dispersion)[1,1]*nobs/(nobs-1)
      delta_bar_hat<-ifelse(det(naive_cov)==0,mean(eigen(ginv(naive_cov)%*%covar)$values),
                            mean(eigen(solve(naive_cov)%*%covar)$values))
      dAIC<-nobs*mse+(4*n_jp+2)*delta_bar_hat
    } else {
      # binary outcome: logistic trend in year, fit with design weights via quasibinomial
      model<-svyglm(as.formula(paste0(dep_var,"~",year)), design=dclus, family=quasibinomial())
      temp<-model$coefficients[1]+input_data[[year]]*model$coefficients[2]
      temp<-exp(temp)
      best<- -2*sum(input_data[[weight]]*log((temp/(1+temp))^input_data[[dep_var]]*(1/(1+temp))^(1-input_data[[dep_var]])))
      naive_cov<-as.matrix(model$naive.cov[c(-1),c(-1)])
      covar<-as.matrix(vcov(model)[c(-1),c(-1)])
      delta_bar_hat<-ifelse(det(naive_cov)==0,sum(diag(ginv(naive_cov)%*%covar))/(n_jp+1),sum(diag(solve(naive_cov)%*%covar))/(n_jp+1))
      dAIC<-nobs/sum(input_data[[weight]])*best+(4*n_jp+2)*delta_bar_hat
    }
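    # dAIC above is a design-based AIC: a (weighted) fit term plus a penalty inflated by
    # delta_bar_hat, an estimated design effect built from the naive (iid-assumption) covariance
    # and the design-based covariance, in the spirit of Lumley & Scott (2015, "AIC and BIC for
    # modelling with complex survey data"); ginv() replaces solve() when the naive covariance
    # matrix is singular.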
    result<-list("distinct"=distinct(input_data,input_data[[year]],.keep_all=TRUE),"n_jp"=n_jp,"dAIC"=dAIC,
                 "constrained"=model,"unconstrained"=model,"df"=degf(dclus))
  } else {
    # create the hinge variables max(0, year-i) for every candidate joinpoint location i
    for (i in (min_year+min_obs_to_end):(max_year-min_obs_to_end)) {
      input_data[[paste0("covar_",i)]]<-ifelse(input_data[[year]]>i,input_data[[year]]-i,0)
    }
    input_data<-input_data[order(input_data[[strata]],input_data[[psu]]),]
    # enumerate all admissible combinations of n_jp joinpoint locations
    combinations<-as.data.frame((min_year+min_obs_to_end):(max_year-min_obs_to_end-(n_jp-1)*(min_obs_between+1)))
    colnames(combinations)<-'year1'
    if (n_jp>1) {
      for (i in 2:n_jp) {
        combinations<-as.data.frame(expand_grid(combinations,(min_year+min_obs_to_end):(max_year-min_obs_to_end-(n_jp-i)*(min_obs_between+1))))
        colnames(combinations)[i]<-paste0('year',i)
        combinations<-combinations[combinations[,i]>=combinations[,i-1]+min_obs_between+1,]
      }
    }
    dclus<-svydesign(ids=as.formula(paste0("~",psu)), strata=as.formula(paste0("~",strata)), nest=T,
                     weights=as.formula(paste0("~",weight)), data=input_data)
    # grid search: fit the constrained (continuous) model at every candidate combination and keep the best fit
    best<-ifelse(model_type==0,0,9e99)
    for (i in 1:dim(combinations)[1]) {
      tempchar<-""
      for (j in 1:n_jp) {
        tempchar<-paste0(tempchar,"+covar_",combinations[i,j])
      }
      if (model_type==0) {
        model<-svyglm(as.formula(paste0(dep_var,"~",year,tempchar)), design=dclus)
        r_square<-1-model$deviance/model$null.deviance
        if (r_square>best) {
          best<-r_square
          best_i<-i
        }
      } else {
        model<-svyglm(as.formula(paste0(dep_var,"~",year,tempchar)), design=dclus, family=quasibinomial())
        temp<-model$coefficients[1]+input_data[[year]]*model$coefficients[2]
        for (j in 1:n_jp) {
          temp<-temp+input_data[[paste0("covar_",combinations[i,j])]]*model$coefficients[j+2]
        }
        temp<-exp(temp)
        neg2logl<- -2*sum(input_data[[weight]]*log((temp/(1+temp))^input_data[[dep_var]]*(1/(1+temp))^(1-input_data[[dep_var]])))
        if (neg2logl<best) {
          best<-neg2logl
          best_i<-i
        }
      }
    }
    # segment indicators (x_1, x_3, ...) and segment-specific year terms (x_2, x_4, ...) for the
    # unconstrained model, built at the best joinpoint combination
    input_data[["x_1"]]<-ifelse(input_data[[year]]<=combinations[best_i,1],1,0)
    input_data[["x_2"]]<-input_data[["x_1"]]*input_data[[year]]
    if (n_jp>1) {
      for (i in 2:n_jp) {
        input_data[[paste0("x_",2*i-1)]]<-ifelse(input_data[[year]]>combinations[best_i,i-1] & input_data[[year]]<=combinations[best_i,i],1,0)
        input_data[[paste0("x_",2*i)]]<-input_data[[paste0("x_",2*i-1)]]*input_data[[year]]
      }
    }
    input_data[[paste0("x_",2*n_jp+1)]]<-ifelse(input_data[[year]]>combinations[best_i,n_jp],1,0)
    input_data[[paste0("x_",2*n_jp+2)]]<-input_data[[paste0("x_",2*n_jp+1)]]*input_data[[year]]
    dclus<-svydesign(ids=as.formula(paste0("~",psu)), strata=as.formula(paste0("~",strata)), nest=T,
                     weights=as.formula(paste0("~",weight)), data=input_data)
    tempchar<-""
    for (j in 1:n_jp) {
      tempchar<-paste0(tempchar,"+covar_",combinations[best_i,j])
    }
    tempchar2<-"x_1"
    for (j in 2:(2*n_jp+2)) {
      tempchar2<-paste0(tempchar2,"+x_",j)
    }
    # matrix_a maps the segment-wise parameters of the unconstrained model to
    # (intercept_1, slope_1, slope_2-slope_1, ..., slope_{n_jp+1}-slope_{n_jp})
    matrix_a<-rbind(c(1,0,0,0),c(0,1,0,0),c(0,-1,0,1))
    if (n_jp>1) {
      for (i in 2:n_jp) {
        matrix_a<-cbind(matrix_a,rep(0,nrow(matrix_a)),rep(0,nrow(matrix_a)))
        matrix_a<-rbind(matrix_a,rep(0,ncol(matrix_a)))
        matrix_a[i+2,2*i]<- -1
        matrix_a[i+2,2*i+2]<- 1
      }
    }
    if (model_type==0) {
      # constrained (continuous) model: dAIC from the design-based covariance
      model_constrained<-svyglm(as.formula(paste0(dep_var,"~",year,tempchar)), design=dclus)
      naive_cov<-model_constrained$naive.cov[c(-1),c(-1)]
      covar<-vcov(model_constrained)[c(-1),c(-1)]
      mse<-as.data.frame(summary(model_constrained)$dispersion)[1,1]*nobs/(nobs-1-n_jp)
      delta_bar_hat<-ifelse(det(naive_cov)==0,mean(eigen(ginv(naive_cov)%*%covar)$values),
                            mean(eigen(solve(naive_cov)%*%covar)$values))
      dAIC<-nobs*mse+(4*n_jp+2)*delta_bar_hat
      # unconstrained (discontinuous) model: refit without intercept, then transform with matrix_a
      model<-svyglm(as.formula(paste0(dep_var,"~",tempchar2)), design=dclus)
      naive_cov<-model$naive.cov[c(-1),c(-1)]
      model<-svyglm(as.formula(paste0(dep_var,"~",tempchar2,"-1")), design=dclus)
      covar<-vcov(model)
      covar<-matrix_a%*%covar%*%t(matrix_a)
      covar<-covar[c(-1),c(-1)]
      naive_cov<-matrix_a%*%naive_cov%*%t(matrix_a)
      naive_cov<-naive_cov[c(-1),c(-1)]
      un_delta_bar_hat<-ifelse(det(naive_cov)==0,mean(eigen(ginv(naive_cov)%*%covar)$values),
                               mean(eigen(solve(naive_cov)%*%covar)$values))
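      # un_dAIC (computed next) applies the same design-effect-penalized AIC to the unconstrained,
      # segment-wise model: matrix_a re-expresses that model's covariance in terms of
      # (intercept, first slope, slope changes), so the constrained and unconstrained fits are
      # penalized on comparable parameters.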
      un_dAIC<-nobs*mse+(4*n_jp+2)*un_delta_bar_hat
    } else {
      # binary outcome: same steps with design-weighted quasibinomial fits
      model_constrained<-svyglm(as.formula(paste0(dep_var,"~",year,tempchar)), design=dclus, family=quasibinomial())
      naive_cov<-model_constrained$naive.cov[c(-1),c(-1)]
      covar<-vcov(model_constrained)[c(-1),c(-1)]
      delta_bar_hat<-ifelse(det(naive_cov)==0,sum(diag(ginv(naive_cov)%*%covar))/(n_jp+1),sum(diag(solve(naive_cov)%*%covar))/(n_jp+1))
      dAIC<-nobs/sum(input_data[[weight]])*best+(4*n_jp+2)*delta_bar_hat
      model<-svyglm(as.formula(paste0(dep_var,"~",tempchar2)), design=dclus, family=quasibinomial())
      naive_cov<-model$naive.cov[c(-1),c(-1)]
      model<-svyglm(as.formula(paste0(dep_var,"~",tempchar2,"-1")), design=dclus, family=quasibinomial())
      covar<-vcov(model)
      covar<-matrix_a%*%covar%*%t(matrix_a)
      covar<-covar[c(-1),c(-1)]
      naive_cov<-matrix_a%*%naive_cov%*%t(matrix_a)
      naive_cov<-naive_cov[c(-1),c(-1)]
      un_delta_bar_hat<-ifelse(det(naive_cov)==0,sum(diag(ginv(naive_cov)%*%covar))/(n_jp+1),sum(diag(solve(naive_cov)%*%covar))/(n_jp+1))
      un_dAIC<-nobs/sum(input_data[[weight]])*best+(4*n_jp+2)*un_delta_bar_hat
    }
    result<-list("distinct"=distinct(input_data,input_data[[year]],.keep_all=TRUE),"n_jp"=n_jp,"dAIC"=dAIC,"un_dAIC"=un_dAIC,
                 "jp"=unlist(combinations[best_i,]),"constrained"=model_constrained,"unconstrained"=model,"df"=degf(dclus))
  }
  return(result)
}

# ------ Definition of the function 'unit_level' ------ #
# unit_level: find the best joinpoint model for complex survey data
# Input:
#   input_data      - data frame containing all variables needed
#   strata          - name of the strata variable
#   psu             - name of the psu variable
#   weight          - name of the weight variable
#   year            - name of the year variable
#   model_type      - 0: continuous outcome; otherwise, binary outcome
#   dep_var         - name of the dependent variable. For a binary outcome, the event must be coded 1
#   log_transfer    - TRUE or FALSE; if TRUE, the outcome is log-transformed. Only applicable if model_type=0
#   min_year        - start year
#   max_year        - end year
#   min_jp          - minimum number of joinpoints
#   max_jp          - maximum number of joinpoints
#   min_obs_to_end  - minimum number of observations from a joinpoint to either end of the data
#   min_obs_between - minimum number of observations between two joinpoints
#   min_ppd         - minimum percentage-points difference required to overwrite the default model (number of joinpoints=min_jp).
#                     Only applicable if (model_type=0 and log_transfer=TRUE) or (model_type!=0)
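#
# Model form (sketch, as implied by the code above): with joinpoints tau_1 < ... < tau_k, the
# constrained model fit by 'unit_level_n_jp' is the continuous segmented trend
#   E[y] = b0 + b1*year + d_1*max(0, year-tau_1) + ... + d_k*max(0, year-tau_k)
# (for a binary outcome the same linear predictor is used on the logit scale); the hinge terms
# max(0, year-tau_k) are the "covar_" variables, and the slope of segment m is b1+d_1+...+d_{m-1}.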
#
# Output:
#   A list that contains model information, including parameter estimates, the APC and its 95% confidence intervals, and the AAPC and its 95% confidence intervals

unit_level<-function(input_data,strata,psu,weight,year,model_type,dep_var,log_transfer,min_year,max_year,min_jp,max_jp,min_obs_to_end=2,min_obs_between=2,min_ppd=0)
{
  new_dep_var<-dep_var
  if (model_type==0 && log_transfer==TRUE) {
    # log-transform the continuous outcome so that slopes are interpretable on the percent-change scale
    new_dep_var<-paste0("log_",dep_var)
    input_data[[new_dep_var]]<-log(input_data[[dep_var]])
  }
  # fit models with min_jp to max_jp joinpoints and keep the one with the smallest dAIC
  for (i in min_jp:max_jp) {
    temp<-unit_level_n_jp(i,input_data,strata,psu,weight,year,model_type,new_dep_var,min_year,max_year,min_obs_to_end,min_obs_between)
    if (i==min_jp) {
      result<-temp
    } else {
      if (temp$dAIC<result$dAIC) {
        result<-temp
      }
    }
  }
  # shift the selected joinpoints back to the original year scale
  if (result$n_jp>0) {
    result$jp<-result$jp+min_year
  }
  return(result)
}

# ------ Example Analysis using the NHIS 1991-2016 BMI data to analyze trends in BMI using unit-level joinpoint models ------ #
# .libPaths(c("/rpackage_directory/r_packages", .libPaths()))   # only needed if running the R program on Linux with a local package library #
require(survey)
require(tidyr)
require(dplyr)
require(MASS)   # provides ginv(), used when a naive covariance matrix is singular

# ------ Read in the analysis file and required variables; update the folder, the file name, and the variable names to match the user's file ------ #
NHIS_BMI<-read.csv(file='/file_directory/nhis_bmi_1991-2016.csv')
dim(NHIS_BMI)
NHIS_BMI$obese1<-ifelse(NHIS_BMI$obese=="Yes",1,0)
# NHIS_BMI$weight<-NHIS_BMI$weight/mean(NHIS_BMI$weight)   # normalize the survey weight if needed #
NHIS_BMI[1:20,]
sum(NHIS_BMI$weight)
NHIS_BMI$l_BMI_R<-log(NHIS_BMI$BMI_R)

# ------ Call the unit_level function and run the analysis; update the variable names to match those in the user's file ------ #
result<-unit_level(input_data=NHIS_BMI,strata="STRATA",psu="PSU",weight="weight",year="YEAR",model_type=0,dep_var="BMI_R",log_transfer=TRUE,
                   min_year=1991,max_year=2016,min_jp=0,max_jp=4,min_obs_to_end=2,min_obs_between=2,min_ppd=0)
result

result<-unit_level(input_data=NHIS_BMI,strata="STRATA",psu="PSU",weight="weight",year="YEAR",model_type=1,dep_var="obese1",log_transfer=FALSE,
                   min_year=1991,max_year=2016,min_jp=0,max_jp=4,min_obs_to_end=2,min_obs_between=2,min_ppd=0)
result

result<-unit_level(input_data=NHIS_BMI,strata="STRATA",psu="PSU",weight="weight",year="YEAR",model_type=0,dep_var="BMI_R",log_transfer=TRUE,
                   min_year=1991,max_year=2016,min_jp=0,max_jp=4,min_obs_to_end=2,min_obs_between=2,min_ppd=0.3)
result
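
# ------ Optional: a minimal self-contained sketch on simulated data ------ #
# Everything below is illustrative only: the simulated design (10 strata, 2 PSUs per stratum,
# uniform weights) and the true slope change at 2003 are assumptions for demonstration, not part
# of Liu et al. (2025); the call simply mirrors the NHIS example above for users without the CSV file.
set.seed(20251121)
sim<-expand.grid(YEAR=1991:2016,id=1:200)              # 200 respondents per survey year (assumed)
sim$STRATA<-sample(1:10,nrow(sim),replace=TRUE)        # assumed design strata
sim$PSU<-sample(1:2,nrow(sim),replace=TRUE)            # assumed PSUs within strata
sim$weight<-runif(nrow(sim),0.5,1.5)                   # assumed survey weights
# continuous outcome whose log-scale trend has one true slope change at 2003
sim$BMI_R<-exp(3.3+0.002*(sim$YEAR-1991)+0.004*pmax(0,sim$YEAR-2003)+rnorm(nrow(sim),0,0.05))
sim_result<-unit_level(input_data=sim,strata="STRATA",psu="PSU",weight="weight",year="YEAR",model_type=0,dep_var="BMI_R",log_transfer=TRUE,
                       min_year=1991,max_year=2016,min_jp=0,max_jp=1,min_obs_to_end=2,min_obs_between=2,min_ppd=0)
sim_result$n_jp                                        # number of joinpoints selected
sim_result$jp                                          # estimated joinpoint location(s), if any
100*(exp(coef(sim_result$constrained)[["YEAR"]])-1)    # APC of the first segment under the log-linear model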