dynamic.require <- function( package ) {
  # load a package by name, installing it first if it's not already present
  if ( require( package, character.only = TRUE ) ) {
    return( 'done' )
  }
  install.packages( package )
  return( require( package, character.only = TRUE ) )
}

tab.sum <- function( x.1, y ) {
  # written by Daniel Helkey, edited by David Draper
  # summarize the outcome variable y by the levels of the categorical variable x.1
  # make sure x.1 and y are the same length
  stopifnot( length( x.1 ) == length( y ) )
  # two helper functions: a per-group summary, and its application to one level of x.1
  summary.function <- function( x ) {
    return( list( n = length( x ), mean = mean( x, na.rm = TRUE ),
      sd = sd( x, na.rm = TRUE ) ) )
  }
  map.function <- function( level ) {
    indices <- ( x.1 == level )
    summary.function( y[ indices ] )
  }
  levels <- sort( unique( x.1 ) )
  out.matrix <- do.call( rbind, Map( map.function, levels ) )
  out.matrix <- cbind( levels, out.matrix )
  out.matrix <- rbind( out.matrix, c( 'Total', summary.function( y ) ) )
  colnames( out.matrix )[ 1 ] <- as.character( substitute( x.1 ) )
  return( out.matrix )
}

winsorized.logit <- function( p ) {
  # logit transform with p clamped to [ 0.001, 0.999 ], so that lowess
  # estimates of 0 or 1 don't map to -Inf or +Inf
  p.winsorized <- ifelse( p > 0.999, 0.999, p )
  p.winsorized <- ifelse( p.winsorized < 0.001, 0.001, p.winsorized )
  l <- log( p.winsorized / ( 1 - p.winsorized ) )
  return( l )
}

logistic.regression.univariate.exploration <- function( x, y,
  n.bootstrap.replicates, n.cutpoints ) {
  # graphical and tabular exploration of how a binary outcome y depends on a
  # single continuous predictor x, on the logit scale
  dynamic.require( 'gtools' )   # quantcut( ) below comes from the gtools package
  n <- length( x )
  plot( x, y, ylim = c( -7, 7 ), col = 'red', pch = 16,
    main = 'logit( lowess p.hat ): black and blue curves', ylab = 'y (red dots)' )
  # bootstrap the lowess curve to visualize its sampling variability
  for ( i in 1:n.bootstrap.replicates ) {
    bootstrap.sample.indices <- sample( 1:n, n, replace = TRUE )
    bootstrap.lowess.curve <- lowess( x[ bootstrap.sample.indices ],
      y[ bootstrap.sample.indices ] )
    lines( bootstrap.lowess.curve$x, winsorized.logit( bootstrap.lowess.curve$y ),
      lwd = 1, lty = 3, col = 'black' )
  }
  # lowess curve for the full data set, also on the logit scale
  lowess.curve <- lowess( x, y )
  lines( lowess.curve$x, winsorized.logit( lowess.curve$y ), lwd = 2, col = 'blue' )
  # discretize x into n.cutpoints quantile-based groups and summarize y within them
  x.cut <- quantcut( x, n.cutpoints )
  print( ' ' )
  print( '########################### tab.sum results: ###########################' )
  print( ' ' )
  print( tab.sum( x.cut, y ) )
  print( ' ' )
  print( '############# results from logistic regression of y on x: #############' )
  print( ' ' )
  glm.y.x <- glm( y ~ x, family = binomial )
  print( summary( glm.y.x ) )
  print( paste( 'converged?', glm.y.x$converged, 'boundary?', glm.y.x$boundary ) )
  print( ' ' )
  print( '######### results from logistic regression of y on x and x^2: #########' )
  print( ' ' )
  x.squared <- x^2
  glm.y.x.xsquared <- glm( y ~ x + x.squared, family = binomial )
  print( summary( glm.y.x.xsquared ) )
  print( paste( 'converged?', glm.y.x.xsquared$converged,
    'boundary?', glm.y.x.xsquared$boundary ) )
  print( ' ' )
  print( '########### results from logistic regression of y on x.cut: ###########' )
  print( ' ' )
  glm.y.x.cut <- glm( y ~ x.cut, family = binomial )
  print( summary( glm.y.x.cut ) )
  print( paste( 'converged?', glm.y.x.cut$converged,
    'boundary?', glm.y.x.cut$boundary ) )
  print( ' ' )
  print( '###### AIC comparison of x with x^2 and x.cut (smaller is better) ######' )
  print( ' ' )
  # reuse the models fitted above rather than refitting them
  print( paste( 'x AIC', signif( glm.y.x$aic ) ) )
  print( paste( 'x and x^2 AIC', signif( glm.y.x.xsquared$aic ) ) )
  print( paste( 'x.cut AIC', signif( glm.y.x.cut$aic ) ) )
}
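
# usage sketch: the data below are simulated purely for illustration (not part
# of the original analysis), assuming x is a continuous predictor and y is a
# binary 0/1 outcome with a quadratic relationship on the logit scale

set.seed( 1 )
n <- 500
x <- rnorm( n )
p <- 1 / ( 1 + exp( -( -0.5 + 1.2 * x - 0.8 * x^2 ) ) )
y <- rbinom( n, 1, p )

logistic.regression.univariate.exploration( x, y,
  n.bootstrap.replicates = 100, n.cutpoints = 5 )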