# This function computes statstics based on "Some tests for Comparing Cumulative Incidence Functions # and Cause-Specific Hazard Rates" by Aly, Kochar, and McKeague (JASA Sept. 1994, Vol. 89, No. 427, pg 994) # Function for competing hazard rate estimation. # Based on "Some Tests for Comparing Cumulative Incidence Functions and Cause-Specific Hazard Rates" # the alternative hpothesis is F_1(t) \leq F_2(t) with strict inequality for some t>0. # This is setup for two causes of failure the arguments should be as follows: # # observed.times -- a vector of time of failure or in the censoring case the # minimum of failure time and censoring time # # observed.cause -- a vector containing 1's for cause 1 and 2's for cause 2 and 0's for right # censoring # # all.stats -- forces D_3n and D_4n to be output even in the uncensored case # # alterative -- "1<=2" implies tesing F_1(t) \leq F_2(t) and g_1(t) \leq g_2(t), "2<=1" implies the opposite one sided test competing.hazard <- function(observed.times, observed.cause, all.stats=FALSE, alternative="1<=2"){ # Being a good programer and checking function inputs (happens sometimes) if(missing(observed.times)){ stop("The observed.times argument is not optional") } else { # Attempt to Coerce to numeric vector as.vector(observed.times, mode="numeric") if(!is.vector(observed.times)){ stop("Numeric vector expected for argument observed.times") } if(sum(is.na(observed.times))>0){ stop("The observed.times vector cannot contain NA values") } } if(missing(observed.cause)){ stop("The observed.cause argument is not optional") } else { # Need to verify only 0, 1, or 2 as elements in the vector -- This handles NA too n <- length(observed.cause) if(sum(observed.cause %in% c(0, 1, 2)) < n){ stop("Value in observed.cause vector other than 0, 1, or 2") } } # Verify that the vectors are of the same length if(length(observed.times) != n){ stop("The observed.times and observed.cause vectors should be the same length") } if(!(all.stats==TRUE || all.stats==FALSE)){ stop("The all.stats argument must be a logical value") } if(alternative=="2<=1"){ observed.cause <- (observed.cause >= 0)*(3 - observed.cause) } # Check if there is censoring if(sum(observed.cause==0) > 0){ censoring = TRUE } else { censoring = FALSE } # Setup empty list object to output function data output <- list() # Internal functions used in D_2n and D_4n statistics ugly.cdf <- function(t,n){ j <- 1:(2*t) messy_vec <- ( cos( (j*pi) / (2*t + 1) )^n * sin( (j*pi*(t + 1)) / (2*t + 1) ) * (1 + cos( (j*pi) / (2*t+1) ) ) * ((1-(-1)^j)/2) ) / sin( (j*pi) / (2*t + 1) ) # Formula differs from original source paper -- confirmed typo in paper with author # used formula from "Testing for change-points with rank and sign statistics" by Gombay # (Statistics & Probability Letters 20, 1994, 49-55) output <- 2/(2*t + 1) * sum(messy_vec) return(output) } asymp.cdf <- function(c){ #In theory sum goes to infinty but 5K seems close enough k <- 0:5000 output <- 4/pi * sum((-1)^k/(2*k+1) * exp(-pi^2*(2*k+1)^2/(8*c^2))) return(output) } if(!censoring){ #Create cumulative incidence functions cif_1 <- cumsum(observed.cause == 1)/n cif_2 <- cumsum(observed.cause == 2)/n #Calculate first uncensored statistic D_1n <- max(0,cif_2 - cif_1) #Get exact p-value k <- n*D_1n #Need prob n*D_1n > k exact_p_val <- sum(1/(2^n)*choose(n,floor((n-k:n)/2))) #Get asymptotic p-value asymp_p_val <- 2*(1-pnorm(sqrt(n)*D_1n)) output$D1n <- list(statistic=D_1n, exact.pval=exact_p_val, asymp.pval=asymp_p_val) #Calculate second uncensored statistic psi_n <- c(0,cif_2 - cif_1) sups <- numeric(n) for(i in 2:n){ cur_val <- psi_n[i] sups[i] <- max(cur_val - psi_n[1:(i-1)]) } D_2n <- max(sups) #Get exact p-value exact_p_val <- 1 - ugly.cdf(n*D_2n,n) #Get asymptotic p-value asymp_p_val <- 1 - asymp.cdf(sqrt(n)*D_2n) output$D2n <- list(statistic=D_2n, exact.pval=exact_p_val, asymp.pval= asymp_p_val) } if(censoring || all.stats){ #The functions below can also be accomplished by the survival library # since they were short enough I went ahead and coded them but compared to that library. # Avoids dependencies, which are both good and bad.... #Matched this function to results from survival library # library(survival) # fit <- survfit(Surv(time, status) ~ 1, data = aml) # cumprod((fit$n.risk - fit$n.event)/fit$n.risk) # emp.prod.lim(aml$time,aml$status) emp.prod.lim <- function(observed.times, event.ind){ n <- length(observed.times) #Order in case not passed that way -- Mostly for testing since this is an internal function time_order <- order(observed.times) event.ind <- event.ind[time_order] observed.times <- observed.times[time_order] unique_times <- unique(observed.times) n_unique <- length(unique_times) num_events <- tabulate(match(observed.times,unique_times)*event.ind, nbins = n_unique) #Create risk set vector for time t-1 -- that is shifted left one place # so time 0 is in the first place of the vector num_fails <- tabulate(match(observed.times,unique_times)) aggregate_fails <- cumsum(num_fails) risk_set <- c(n,n - aggregate_fails[1:(n_unique-1)]) prod_lim <- cumprod((risk_set - num_events)/risk_set) return(prod_lim) } #Matched this function to results from survival library # library(survival) # fit <- survfit(Surv(time, status) ~ 1, data = aml) # cumsum(fit$n.event/fit$n.risk) # emp.cshr(aml$time[order(aml$time)],aml$status[order(aml$time)]) emp.cshr <- function(observed.times, cause.ind){ n <- length(observed.times) #Order in case not passed that way -- Mostly for testing since this is an internal function time_order <- order(observed.times) cause.ind <- cause.ind[time_order] observed.times <- observed.times[time_order] unique_times <- unique(observed.times) n_unique <- length(unique_times) num_cause_fails <- tabulate(match(observed.times,unique_times)*cause.ind, nbins = n_unique) aggregate_cause_fails <- cumsum(num_cause_fails) #Create risk set vector for time t-1 -- that is shifted left one place # so time 0 is in the first place of the vector num_fails <- tabulate(match(observed.times,unique_times)) aggregate_fails <- cumsum(num_fails) risk_set <- c(n,n - aggregate_fails[1:(n_unique-1)]) aalen <- cumsum(num_cause_fails/risk_set) return(aalen) } # Setup indicator vector for censoring n_unique <- length(unique(observed.times)) censor_ind <- (observed.cause == 0) pl_fail <- emp.prod.lim(observed.times,(1-censor_ind)) pl_censor <- emp.prod.lim(observed.times,censor_ind) aalen_1 <- emp.cshr(observed.times,(observed.cause == 1)) aalen_2 <- emp.cshr(observed.times,(observed.cause == 2)) aalen_diff <- (aalen_2 - aalen_1) phi_n <- c(0,cumsum(pl_fail*sqrt(pl_censor)*(aalen_diff - c(0,aalen_diff[1:(n_unique-1)])))) D_3n <- max(phi_n) #Get asymptotic p-value asymp_p_val <- 2*(1-pnorm(sqrt(n)*D_3n)) output$D3n <- list(statistic=D_3n, asymp.pval=asymp_p_val) sups <- numeric(n+1) sups[1] <- phi_n[1] for(i in 2:(n_unique+1)){ cur_val <- phi_n[i] sups[i] <- max(cur_val - phi_n[1:(i-1)]) } D_4n <- max(sups) asymp_p_val <- 1 - asymp.cdf(sqrt(n)*D_4n) output$D4n <- list(statistic=D_4n, asymp.pval=asymp_p_val) } return(output) }