#----------------------------------------------------------------------------------------------------------------------------------------------------#
# Unit_Level_Joinpoint_CompSurv.R
# Created by Joe Zou (11/21/2025)
# Provided by The National Cancer Institute & Information Management Services, Inc.
# The program was developed from the project described in Liu et al. (2025). 
# The program can be used to analyze trend of complex survey data using unit-level joinpoint regression model. 
# The program defines two functions:                                                                        
#  1) unit_level_n_jp: This function finds the best joinpoint model with given number of joinpoints (=n_jp) for complex survey data. 
#  2) unit_level: This function calls for function 'unit_level_n_jp' and finds the best joinpoint model for complex survey data.
# Users need to read in their data, define the required parameters, and call the 'unit-level' function.
# An example of trend analysis using the BMI data from the National Health Interview Survey (NHIS)(1991–2016) is provided at the end of the program. 
#----------------------------------------------------------------------------------------------------------------------------------------------------#


# ------   Definition of the function 'unit_level_n_jp'   ------ #
# unit_level_n_jp: find the best joinpoint model with number of joinpoints=n_jp for complex survey data
# Input:
#   input_data      - data frame containing all variables needed
#   strata          - name of the strata variable
#   psu             - name of the psu variable
#   weight          - name of the weight variable
#   year            - name of the year variable
#   model_type      - 0: continuous outcome; otherwise, binary outcome
#   dep_var         - name of the dependent variable.  For binary outcome, event must be 1
#   min_year        - start year
#   max_year        - end year
#   min_obs_to_end  - minimum number of observations from a joinpoint to either end of the data 
#   min_obs_between - minimum number of observations between two joinpoints
#
# Output:
#   A list that contains model information

unit_level_n_jp<-function(n_jp,input_data,strata,psu,weight,year,model_type,dep_var,min_year,max_year,min_obs_to_end,min_obs_between) {
  input_data<-input_data[input_data[,year]>=min_year & input_data[,year]<=max_year,]
  input_data[[year]]<-input_data[[year]]-min_year
  max_year<-max_year-min_year
  min_year<-0
  nobs<-dim(input_data)[1]
  if (n_jp==0) {
    dclus<-svydesign(ids=as.formula(paste0("~",psu)), strata=as.formula(paste0("~",strata)), nest=T, weights=as.formula(paste0("~",weight)), data=input_data)
    if (model_type==0) {
      model<-svyglm(as.formula(paste0(dep_var,"~",year)), design=dclus)
      naive_cov<-as.matrix(model$naive.cov[c(-1),c(-1)])
      covar<-as.matrix(vcov(model)[c(-1),c(-1)])
      mse<-as.data.frame(summary(model)$dispersion)[1,1]*nobs/(nobs-1)
      delta_bar_hat<-ifelse(det(naive_cov)==0,mean(eigen(ginv(naive_cov)%*%covar)$values), mean(eigen(solve(naive_cov)%*%covar)$values))
      dAIC<-nobs*mse+(4*n_jp+2)*delta_bar_hat  
    } else {
      model<-svyglm(as.formula(paste0(dep_var,"~",year)), design=dclus, family=quasibinomial())
      temp<-model$coefficients[1]+input_data[[year]]*model$coefficients[2]
      temp<-exp(temp)
      best<- -2*sum(input_data[[weight]]*log((temp/(1+temp))^input_data[[dep_var]]*(1/(1+temp))^(1-input_data[[dep_var]])))  
      naive_cov<-as.matrix(model$naive.cov[c(-1),c(-1)])
      covar<-as.matrix(vcov(model)[c(-1),c(-1)])
      delta_bar_hat<-ifelse(det(naive_cov)==0,sum(diag(ginv(naive_cov)%*%covar))/(n_jp+1),sum(diag(solve(naive_cov)%*%covar))/(n_jp+1)) 
      dAIC<-nobs/sum(input_data[[weight]])*best+(4*n_jp+2)*delta_bar_hat  
    }
    result<-list("distinct"=distinct(input_data,input_data[[year]],.keep_all=TRUE),"n_jp"=n_jp,"dAIC"=dAIC,"constrained"=model,"unconstrained"=model,"df"=degf(dclus))
  } else {
    for (i in (min_year+min_obs_to_end):(max_year-min_obs_to_end)) {
      input_data[[paste0("covar_",i)]]<-ifelse(input_data[[year]]>i,input_data[[year]]-i,0)
    }
    input_data<-input_data[order(input_data[[strata]],input_data[[psu]]),]
    combinations<-as.data.frame((min_year+min_obs_to_end):(max_year-min_obs_to_end-(n_jp-1)*3))
    colnames(combinations)<-'year1'
    if (n_jp>1) {
      for (i in 2:n_jp) {
        combinations<-as.data.frame(expand_grid(combinations,(min_year+min_obs_to_end):(max_year-min_obs_to_end-(n_jp-i)*(min_obs_between+1))))
        colnames(combinations)[i]<-paste0('year',i)
        combinations<-combinations[combinations[,i]>=combinations[,i-1]+min_obs_between+1,]
      }
    }
    dclus<-svydesign(ids=as.formula(paste0("~",psu)), strata=as.formula(paste0("~",strata)), nest=T, weights=as.formula(paste0("~",weight)), data=input_data)
    best<-ifelse(model_type==0,0,9e99)
    for (i in 1:dim(combinations)[1]){
      tempchar<-""
      for (j in 1:n_jp) {
        tempchar<-paste0(tempchar,"+covar_",combinations[i,j])
      }
      if (model_type==0) {
        model<-svyglm(as.formula(paste0(dep_var,"~",year,tempchar)), design=dclus)
        r_square<-1-model$deviance/model$null.deviance
        if (r_square>best) {
          best<-r_square
          best_i<-i
        }  
      } else {
        model<-svyglm(as.formula(paste0(dep_var,"~",year,tempchar)), design=dclus, family=quasibinomial())
        temp<-model$coefficients[1]+input_data[[year]]*model$coefficients[2]
        for (j in 1:n_jp) {
          temp<-temp+input_data[[paste0("covar_",combinations[i,j])]]*model$coefficients[j+2]
        }
        temp<-exp(temp)
        neg2logl<- -2*sum(input_data[[weight]]*log((temp/(1+temp))^input_data[[dep_var]]*(1/(1+temp))^(1-input_data[[dep_var]])))  
        if (neg2logl<best) {
          best<-neg2logl
          best_i<-i
        }  
      }
    }
    input_data[["x_1"]]<-ifelse(input_data[[year]]<combinations[best_i,1],1,0)
    input_data[["x_2"]]<-input_data[["x_1"]]*input_data[[year]]
    if (n_jp>1) {
      for (i in 2:n_jp) {
        input_data[[paste0("x_",2*i-1)]]<-ifelse(input_data[[year]]>combinations[best_i,i-1] & input_data[[year]]<combinations[best_i,i],1,0)
        input_data[[paste0("x_",2*i)]]<-input_data[[paste0("x_",2*i-1)]]*input_data[[year]]
      } 
    }
    input_data[[paste0("x_",2*n_jp+1)]]<-ifelse(input_data[[year]]>combinations[best_i,n_jp],1,0)
    input_data[[paste0("x_",2*n_jp+2)]]<-input_data[[paste0("x_",2*n_jp+1)]]*input_data[[year]]
    dclus<-svydesign(ids=as.formula(paste0("~",psu)), strata=as.formula(paste0("~",strata)), nest=T, weights=as.formula(paste0("~",weight)), data=input_data)
    tempchar<-""
    for (j in 1:n_jp) {
      tempchar<-paste0(tempchar,"+covar_",combinations[best_i,j])
    }
    tempchar2<-"x_1"
    for (j in 2:(2*n_jp+2)) {
      tempchar2<-paste0(tempchar2,"+x_",j)
    }
    matrix_a<-rbind(c(1,0,0,0),c(0,1,0,0),c(0,-1,0,1))
    if (n_jp>1) {
      for (i in 2:n_jp) {
        matrix_a<-cbind(matrix_a,rep(0,nrow(matrix_a)),rep(0,nrow(matrix_a)))
        matrix_a<-rbind(matrix_a,rep(0,ncol(matrix_a)))
        matrix_a[i+2,2*i]<- -1
        matrix_a[i+2,2*i+2]<- 1
      }
    }
    if (model_type==0) {
      model_constrained<-svyglm(as.formula(paste0(dep_var,"~",year,tempchar)), design=dclus)
      naive_cov<-model_constrained$naive.cov[c(-1),c(-1)]
      covar<-vcov(model_constrained)[c(-1),c(-1)]
      mse<-as.data.frame(summary(model_constrained)$dispersion)[1,1]*nobs/(nobs-1-n_jp)
      delta_bar_hat<-ifelse(det(naive_cov)==0,mean(eigen(ginv(naive_cov)%*%covar)$values), mean(eigen(solve(naive_cov)%*%covar)$values))
      dAIC<-nobs*mse+(4*n_jp+2)*delta_bar_hat
      
      model<-svyglm(as.formula(paste0(dep_var,"~",tempchar2)), design=dclus)
      naive_cov<-model$naive.cov[c(-1),c(-1)]
      model<-svyglm(as.formula(paste0(dep_var,"~",tempchar2,"-1")), design=dclus)
      covar<-vcov(model)
      covar<-matrix_a%*%covar%*%t(matrix_a)
      covar<-covar[c(-1),c(-1)]
      naive_cov<-matrix_a%*%naive_cov%*%t(matrix_a)
      naive_cov<-naive_cov[c(-1),c(-1)]
      un_delta_bar_hat<-ifelse(det(naive_cov)==0,mean(eigen(ginv(naive_cov)%*%covar)$values), mean(eigen(solve(naive_cov)%*%covar)$values))
      un_dAIC<-nobs*mse+(4*n_jp+2)*un_delta_bar_hat
    } else {
      model_constrained<-svyglm(as.formula(paste0(dep_var,"~",year,tempchar)), design=dclus, family=quasibinomial())
      naive_cov<-model_constrained$naive.cov[c(-1),c(-1)]
      covar<-vcov(model_constrained)[c(-1),c(-1)]
      delta_bar_hat<-ifelse(det(naive_cov)==0,sum(diag(ginv(naive_cov)%*%covar))/(n_jp+1),sum(diag(solve(naive_cov)%*%covar))/(n_jp+1)) 
      dAIC<-nobs/sum(input_data[[weight]])*best+(4*n_jp+2)*delta_bar_hat
      model<-svyglm(as.formula(paste0(dep_var,"~",tempchar2)), design=dclus, family=quasibinomial())
      naive_cov<-model$naive.cov[c(-1),c(-1)]
      model<-svyglm(as.formula(paste0(dep_var,"~",tempchar2,"-1")), design=dclus, family=quasibinomial())
      covar<-vcov(model)
      covar<-matrix_a%*%covar%*%t(matrix_a)
      covar<-covar[c(-1),c(-1)]
      naive_cov<-matrix_a%*%naive_cov%*%t(matrix_a)
      naive_cov<-naive_cov[c(-1),c(-1)]
      un_delta_bar_hat<-ifelse(det(naive_cov)==0,sum(diag(ginv(naive_cov)%*%covar))/(n_jp+1),sum(diag(solve(naive_cov)%*%covar))/(n_jp+1)) 
      un_dAIC<-nobs/sum(input_data[[weight]])*best+(4*n_jp+2)*un_delta_bar_hat
    }
    result<-list("distinct"=distinct(input_data,input_data[[year]],.keep_all=TRUE),"n_jp"=n_jp,"dAIC"=dAIC,"un_dAIC"=un_dAIC,"jp"=unlist(combinations[best_i,]),"constrained"=model_constrained,"unconstrained"=model,"df"=degf(dclus))
  }
  return(result)
}


# ------   Definition of the function 'unit_level'   ------ #
# unit_level: find the best joinpoint model for complex survey data
# Input:
#   input_data      - data frame containing all variables needed
#   strata          - name of the strata variable
#   psu             - name of the psu variable
#   weight          - name of the weight variable
#   year            - name of the year variable
#   model_type      - 0: continuous outcome; otherwise, binary outcome
#   dep_var         - name of the dependent variable.  For binary outcome, event must be 1
#   log_transfer    - TRUE or FALSE.  Only applicable if model_type=0
#   min_year        - start year
#   max_year        - end year
#   min_jp          - minimum number of joinpoints
#   max_jp          - maximum number of joinpoints
#   min_obs_to_end  - minimum number of observations from a joinpoint to either end of the data 
#   min_obs_between - minimum number of observations between two joinpoints
#   min_ppd         - minimum percentage points difference to overwirte the default model (number of joinpoints=min_jp).
#                     Only applicaple if (model_type=0 and log_tranfer=TRUE) or (model_type!=0).
#
# Output:
#   A list that contains model information including parameter estimates, APC and its 95% confidence intervals, and AAPC and its 95% confidence intervals

unit_level<-function(input_data,strata,psu,weight,year,model_type,dep_var,log_transfer,min_year,max_year,min_jp,max_jp,min_obs_to_end=2,min_obs_between=2,min_ppd=0) {
  new_dep_var<-dep_var
  if (model_type==0 && log_transfer==TRUE) {
    new_dep_var<-paste0("log_",dep_var)
    input_data[[new_dep_var]]<-log(input_data[[dep_var]])
  }
  for (i in min_jp:max_jp) {
    temp<-unit_level_n_jp(i,input_data,strata,psu,weight,year,model_type,new_dep_var,min_year,max_year,min_obs_to_end,min_obs_between)
    if (i==min_jp) {
      result<-temp
    } else {
      if (temp$dAIC<result$dAIC) {
        if (model_type==0 & log_transfer==TRUE || model_type!=0) {
          temp2<-coef(temp$constrained)
          fail_min_ppd<-0
          old_slope<-temp2[2]
          for (j in 3:(i+2)) {
            new_slope<-old_slope+temp2[j]
            if (100*abs(exp(old_slope)-exp(new_slope))<min_ppd) {
              fail_min_ppd<-1
            }
            old_slope<-new_slope
          }
          if (fail_min_ppd==0) {
            result<-temp
          }
        } else {
          result<-temp
        }  
      }
    }    
  }
  result$distinct<-result$distinct[order(result$distinct[[year]]),]
  result$estimate<-as.data.frame(predict(result$constrained,result$distinct,type="response"))$response
  if (model_type==0 && log_transfer==TRUE) {
    result$estimate<-exp(result$estimate)
  }
  if (model_type==0 & log_transfer==TRUE || model_type!=0) {
    result$APC<-100*exp(cumsum(coef(result$constrained)[2:(2+result$n_jp)])) - 100
    if (result$n_jp==0) {
      tempchar<-c(paste0(min_year,'-',max_year))
    } else {
      tempchar<-NULL
      temp<-c(0,result$jp,max_year-min_year)+min_year
      for (i in 1:(result$n_jp+1)) {
        tempchar<-cbind(tempchar,paste0(temp[i],'-',temp[i+1]))
      }    
    }
    names(result$APC)<-tempchar
    result$APC_Lower95CI<-100*exp(cumsum(coef(result$constrained)[2:(2+result$n_jp)]) - qt(0.975,result$df)*SE(result$unconstrained)[(1:(result$n_jp+1))*2]) - 100
    result$APC_Upper95CI<-100*exp(cumsum(coef(result$constrained)[2:(2+result$n_jp)]) + qt(0.975,result$df)*SE(result$unconstrained)[(1:(result$n_jp+1))*2]) - 100
    names(result$APC_Lower95CI)<-tempchar
    names(result$APC_Upper95CI)<-tempchar

    if (result$n_jp==0) {
      temp<-1
    } else {
      temp<-(c(result$jp,max_year-min_year)-c(0,result$jp))/(max_year-min_year)
    }
    result$AAPC<-100*exp(sum(cumsum(coef(result$constrained)[2:(2+result$n_jp)])*temp))-100
    temp2<-as.vector(t(as.matrix(temp))%*%vcov(result$unconstrained)[(1:(result$n_jp+1))*2,(1:(result$n_jp+1))*2]%*%as.matrix(temp))
    result$AAPC_Lower95CI<- 100*exp(log(result$AAPC/100+1) - qnorm(0.975)*sqrt(temp2))-100
    result$AAPC_Upper95CI<- 100*exp(log(result$AAPC/100+1) + qnorm(0.975)*sqrt(temp2))-100
  }
  result<-result[names(result)!="distinct"]
  if (result$n_jp>0) {
    result$jp<-result$jp+min_year
  }  
  return(result)
}

# ------   Example Analysis using the NHIS 1991-2016 BMI data to analyze trend on BMI using unit-level joinpoint models   ------#
 
# libPaths(c("/rpackage_directory/r_packages", .libPaths()))  #only need this command if running the R program in Linux# 

require(survey)
require(tidyr)
require(dplyr)

# ------   Read in the analysis file and required variables; need to update the folder, the file name, and the variable names to match user's file   ------# 

NHIS_BMI <- read.csv(file='/file_directory/nhis_bmi_1991-2016.csv') 

dim(NHIS_BMI)
NHIS_BMI$obese1<-ifelse(NHIS_BMI$obese=="Yes",1,0)
#NHIS_BMI$weight<-NHIS_BMI$weight/mean(NHIS_BMI$weight)  #normalize the survey weight if needed# 
NHIS_BMI[1:20,]
sum(NHIS_BMI$weight)
NHIS_BMI$l_BMI_R<-log(NHIS_BMI$BMI_R)


# ------   Call the unit_level function and run the analysis; need to update the variable names to match those in user's file   ------#

result<-unit_level(input_data=NHIS_BMI,strata="STRATA",psu="PSU",weight="weight",year="YEAR",model_type=0,dep_var="BMI_R",log_transfer=TRUE,min_year=1991,max_year=2016,
                   min_jp=0,max_jp=4,min_obs_to_end=2,min_obs_between=2,min_ppd=0)
result

result<-unit_level(input_data=NHIS_BMI,strata="STRATA",psu="PSU",weight="weight",year="YEAR",model_type=1,dep_var="obese1",log_transfer=FALSE,min_year=1991,max_year=2016,
                   min_jp=0,max_jp=4,min_obs_to_end=2,min_obs_between=2,min_ppd=0)
result

result<-unit_level(input_data=NHIS_BMI,strata="STRATA",psu="PSU",weight="weight",year="YEAR",model_type=0,dep_var="BMI_R",log_transfer=TRUE,min_year=1991,max_year=2016,
                   min_jp=0,max_jp=4,min_obs_to_end=2,min_obs_between=2,min_ppd=0.3)
result