########################################################################### ########################################################################### ## ## ## CONTENTS ## ## ## ## Section ## ## i Workspace formatting code ## ## ii Original data file & compressed file ## ## 1 Models ## ## 2 Characteristics of data set ## ## 3 Sku: BSHRPR20EVSKRT ## ## 4 Sku: ELCNEL202C ## ## 5 Sku: DRLR630002LRT ## ## 6 Time ## ## ## ## ## ########################################################################### ########################################################################### ########################################################################### ########################################################################### ## ## ## SECTION i ## ## Formats the workspace and loads the data file. ## ## ## ########################################################################### ########################################################################### rm(list=ls()); setwd("F:/scottdavis"); getwd(); require(AER); # dispersiontest() install.packages('aod'); require(aod); install.packages('arm'); require(arm); require(boot); require(cars); install.packages('dispmod'); require(dispmod); # glm.poisson.disp install.packages('epicalc') require(epicalc); install.packages('faraway'); require(faraway); require(foreign); install.packages('ggplot2'); require(ggplot2); install.packages('grDevices'); # Boxplot axis stuff require(grDevices); install.packages('Hmisc'); require(Hmisc); install.packages('MASS'); require(MASS); # glm.nb install.packages('MuMIn'); require(MuMIn); # QAIC install.packages('pscl'); require(pscl); install.packages('qgraph'); require(qgraph); install.packages('robust'); require(robust); install.packages('sand'); require(sand); require(sandwich); install.packages('stats'); require(stats); require(vcd); install.packages('vcdExtra'); require(vcdExtra); require(VGAM); load('Cpo_Data_Set__Compressed.R') ########################################################################### ########################################################################### ## ## ## SECTION ii ## ## ## ## -The following read in (1st way tried) the zipped .txt file that ## ## was coverted from the original .xls format using LTF Viewer. ## ## -The 2nd way (utilized) converted straight from original .csv ## ## format. ## ## -Compressed file ## ## ## ########################################################################### ########################################################################### ### 1st Way: # c = unzip(path.expand("C:/Users/Scott Davis/Desktop/Data/scottdavis.zip"), files = NULL, list = FALSE, overwrite = TRUE, # junkpaths = FALSE, exdir = ".") ### 2nd Way: Cpo = read.delim(file = "scottdavis.csv", header=T, quote="\"", sep=";", as.is=T); names(Cpo) = c("Sku","Brand","CategoryLevel1","CategoryLevel2","CategoryLevel3","CategoryLevel4","ProductId","SalesChannel", "Gender","MedianIncome","IncomeRange","HourOfDay","Cordless","ProductClassDesc","Condition","AvgDaysInInventory", "ReturnStatus","OrderRcvdDate","ShipDate","InvoiceDate","QtyOrdered","SalesPricePerUnit","CogsPerUnit","ShipAndHandlingRcvd", "SalesTaxRcvd"); # str(Cpo); head(Cpo); ### Compressed files (Less than 25MB) # save(Cpo, file='Cpo_Data_Set__Compressed.R', compress='xz') ########################################################################### ## ## ## SECTION 1: Models ## ## ## ########################################################################### ########################################################################### ########################################################################### ## ## ## Model 1 ## ## ## ########################################################################### ########################################################################### # Aggregated data set. fulltable_1agg = read.csv(file='Aggregated Data Sets, txt and csv/fulltable_1agg.csv', header=T, as.is=T); # dim(fulltable_1agg); # head(fulltable_1agg); # summary(fulltable_1agg); levels(brand); levels(channel); levels(sex); levels(inc); levels(cordless); levels(cond); # Reassignment of variable type for use in glm. brand = as.factor(fulltable_1agg$branddesc); channel = as.factor(fulltable_1agg$saleschanneldesc); sex = as.factor(fulltable_1agg$genderdesc); inc = as.factor(fulltable_1agg$incrange); cordless = as.factor(fulltable_1agg$cordless); cond = as.factor(fulltable_1agg$salesconditiondesc); freq = fulltable_1agg$freq; levels(brand); brand = relevel(brand, ref='Bosch'); levels(channel); channel = relevel(channel, ref='Branded Web'); levels(sex); sex = relevel(sex, ref='Male'); levels(inc); inc = factor(inc, levels = c('High', 'Middle', 'Low')); inc = relevel(inc, ref='Middle'); levels(cordless); cordless = relevel(cordless, ref='Cordless'); levels(cond); cond = relevel(cond, ref='New'); # Display of raw data (ggplot code for the first one to check if model was specified correctly is under "Summary") Conditional_mu_sd_1brand = data.frame(cbind(tapply(freq, brand, mean),tapply(freq, brand, sd))); names(Conditional_mu_sd_1brand) = c('Mean', 'SD'); round(Conditional_mu_sd_1brand, 1); windows(); par(mfrow=c(1,2)); windows(40,20); barplot(table(freq),yaxp=c(0,160,8),ylim=c(1,160),ylab='',xpd=FALSE ,width=.5,xaxp=c(1,25000,100),xlab="Sales Count",space=0,col=4); plot(Conditional_mu_sd_1brand$Mean~Conditional_mu_sd_1brand$SD, col='blue', ylab='', xlim=c(0,1800), ylim=c(0,550)); Conditional_mu_sd_1 = data.frame(cbind(tapply(freq, channel, mean),tapply(freq, channel, sd))); names(Conditional_mu_sd_1) = c('Mean', 'SD'); round(Conditional_mu_sd_1, 1); windows(); par(mfrow=c(1,2)); barplot(table(freq), space=0, xlab="Sales Count", col=4); plot(Conditional_mu_sd_1$Mean~Conditional_mu_sd_1$SD, col='blue'); Conditional_mu_sd_1 = data.frame(cbind(tapply(freq, sex, mean),tapply(freq, sex, sd))); names(Conditional_mu_sd_1) = c('Mean', 'SD'); round(Conditional_mu_sd_1, 1); windows(); par(mfrow=c(1,2)); barplot(table(freq), space=0, xlab="Sales Count"); plot(Conditional_mu_sd_1$Mean~Conditional_mu_sd_1$SD, col='blue'); Conditional_mu_sd_1 = data.frame(cbind(tapply(freq, inc, mean),tapply(freq, inc, sd))); names(Conditional_mu_sd_1) = c('Mean', 'SD'); round(Conditional_mu_sd_1, 1); windows(); par(mfrow=c(1,2)); barplot(table(freq), space=0, xlab="Sales Count"); plot(Conditional_mu_sd_1$Mean~Conditional_mu_sd_1$SD, col='blue'); Conditional_mu_sd_1 = data.frame(cbind(tapply(freq, cordless, mean),tapply(freq, cordless, sd))); names(Conditional_mu_sd_1) = c('Mean', 'SD'); round(Conditional_mu_sd_1, 1); windows(); par(mfrow=c(1,2)); barplot(table(freq), space=0, xlab="Sales Count"); plot(Conditional_mu_sd_1$Mean~Conditional_mu_sd_1$SD, col='blue'); Conditional_mu_sd_1 = data.frame(cbind(tapply(freq, cond, mean),tapply(freq, cond, sd))); names(Conditional_mu_sd_1) = c('Mean', 'SD'); round(Conditional_mu_sd_1, 1); windows(); par(mfrow=c(1,2)); barplot(table(freq), space=0, xlab="Sales Count"); plot(Conditional_mu_sd_1$Mean~Conditional_mu_sd_1$SD, col='blue'); ### MODEL 1 ##POISSON MODELS # All 2-ways interactions. full_mod = glm(freq ~ brand + channel + ordered(inc) + sex + cordless + cond + brand*channel + brand*ordered(inc) + brand*sex + brand*cordless + brand*cond + channel*ordered(inc) + channel*sex + channel*cordless + channel*cond + ordered(inc)*sex + ordered(inc)*cordless + ordered(inc)*cond + sex*cordless + sex*cond + cordless*cond , family = poisson()); # Test for Over(Under)dispersion, dispersiontest(full_mod, trafo = 2, alternative = c("greater","two.sided","less")); # Estimate of overdispersed addiditive model. full_mod_disp = glm.poisson.disp(full_mod, maxit = 50, verbose = FALSE); # Forward & Backward model building:: http://stat.ethz.ch/R-manual/R-patched/library/MASS/html/stepAIC.html # These models include only significant interactions as determined by SAS results of LR Statistics For Type 3 Analysis. # Stepwise selection is done in R because the stepAIC function concisely compares AIC values for multiple models at once. step.1 = stepAIC(full_mod_disp, scope= list(upper = ~ brand*channel + brand*ordered(inc) + brand*sex + brand*cordless + brand*cond + channel*ordered(inc) + channel*sex + channel*cordless + channel*cond + ordered(inc)*sex + ordered(inc)*cordless + ordered(inc)*cond + sex*cordless + sex*cond + cordless*cond , lower = ~ 1), direction="both", trace=FALSE); anova(full_mod_disp, test='Chisq'); # Update: stepAIC and ANOVA suggests ~ - ordered(inc)*cordless full_mod_sw = glm(freq ~ brand*channel + brand*ordered(inc) + brand*sex + brand*cordless + brand*cond + channel*ordered(inc) + channel*sex + channel*cordless + channel*cond + ordered(inc)*sex + ordered(inc)*cond + sex*cordless + sex*cond + cordless*cond, family=poisson()); full_mod_disp_sw = glm.poisson.disp(full_mod_sw, maxit = 50, verbose = FALSE); anova(full_mod_disp_sw, test='Chisq'); ####### QUASI MODEL FITS BETTER as shown by ECDF below ######## full_mod_swquasiPoi = glm(freq ~ brand*channel + brand*ordered(inc) + brand*sex + brand*cordless + brand*cond + channel*ordered(inc) + channel*sex + channel*cordless + channel*cond + ordered(inc)*sex + ordered(inc)*cond + sex*cordless + sex*cond + cordless*cond, family=quasipoisson()); # Additive model used to get odds ratios for comparison. full_mod_add = glm(freq ~ brand + channel + ordered(inc) + sex + cordless + cond, family=poisson()); full_mod_dispadd = glm.poisson.disp(full_mod_add, maxit = 50, verbose = FALSE); # Model Fit # Dissimilarity Index D = sum(abs(freq - e)); D/(2*N); # Dissimilarity for glm.poisson.disp models. N1 = nrow(fulltable_1agg); dispoi_sw = sum(abs(resid(full_mod_disp_sw))) / (2*N1); dispoi_sw; dispoi_add = sum(abs(resid(full_mod_dispadd))) / (2*N1); dispoi_add; full_swquasiPoi = sum(abs(resid(full_mod_swquasiPoi))) / (2*N1); full_swquasiPoi; # AIC/BIC counts the scale estimation as a parameter in the edf, and extractAIC does not. They give equal AIC's. N1 = nrow(fulltable_1agg); extractAIC(full_mod_disp_sw,k=c(2,log(N1))); # The first value is edf, second is AIC, last is BIC. extractAIC(full_mod_dispadd,k=c(2,log(N1))); AIC(full_mod_disp_sw, full_mod_dispadd); # Additive has lower AIC. # Model fit check: Prediction Reliability. Checks to see how many estimates are > 2*SE. attributes(full_mod_dispadd); summary(full_mod_dispadd); C = abs(cbind(coefficients(full_mod_dispadd))); se = cbind(se.coef(full_mod_dispadd)); Dub = cbind(2*se); CDub1add = cbind(C[,1],Dub[,1],se[,1]); CDub1add = as.data.frame(CDub1add); names(CDub1add) = c('|Coef|', '2*SE', 'SE'); N = length(CDub1add[,2]); N; Greater = NULL; no = 0; yes = 0; for(itor in 1:N) { # Checks if the SE more than 2x greater than the predicted value. if(CDub1add[itor,1] >= CDub1add[itor,2]) { Greater[itor] = 'No'; no = no + 1; } else if(CDub1add[itor,1] < CDub1add[itor,2]) { Greater[itor] = 'Yes'; yes = yes + 1; } } prop.grtr1poiadd = yes/N; # Interaction Model C = abs(cbind(coefficients(full_mod_disp_sw))); C = as.data.frame(C); names(C) = 'cof'; C = C[!is.na(C$cof),]; se = cbind(se.coef(full_mod_disp_sw)); Dub = cbind(2*se); CDub1 = cbind(C,Dub[,1],se[,1]); CDub1 = as.data.frame(CDub1); names(CDub1) = c('|Coef|', '2*SE', 'SE'); CDub1 = CDub1[-c(2:30),]; # Removes main effects. N = length(CDub1[,2]); Greater = NULL; no = 0; yes = 0; for(itor in 1:N) { if(CDub1[itor,1] >= CDub1[itor,2]) { Greater[itor] = 'No'; no = no + 1; } else if(CDub1[itor,1] < CDub1[itor,2]) { Greater[itor] = 'Yes'; yes = yes + 1; } } prop.grtr1poisw = yes/N; prop.grtr1poisw; prop.grtr1poiadd; ## NEGATIVE BINOMIAL MODELS # All 2-Way Interactions full_mod_nb = glm.nb(freq ~ brand*channel + brand*ordered(inc) + brand*sex + brand*cordless + brand*cond + channel*ordered(inc) + channel*sex + channel*cordless + channel*cond + ordered(inc)*sex + ordered(inc)*cordless + ordered(inc)*cond + sex*cordless + sex*cond + cordless*cond); # stepAIC suggests ~ - ordered(inc):cordless step.2 = stepAIC(full_mod_nb, scope = list(upper = ~ brand*channel + brand*ordered(inc) + brand*sex + brand*cordless + brand*cond + channel*ordered(inc) + channel*sex + channel*cordless + channel*cond + ordered(inc)*sex + ordered(inc)*cordless + ordered(inc)*cond + sex*cordless + sex*cond + cordless*cond , lower = ~ 1), direction="both", trace=FALSE); # AIC tends to overfit. No additional terms are suggested to be dropped. dropterm(full_mod_nb_sw, test = 'Chisq'); # Update: stepAIC suggestion ~ - ordered(inc):cordless full_mod_nb_sw = glm.nb(freq ~ brand:channel + brand:ordered(inc) + brand:sex + brand:cordless + brand:cond + channel:ordered(inc) + channel:sex + channel:cordless + channel:cond + ordered(inc):sex + ordered(inc):cond + sex:cordless + sex:cond + cordless:cond); # Model comparison shows one is not better than the other, however, the latter is more parsimonious. anova(full_mod_nb, full_mod_nb_sw); c(theta = full_mod_nb_sw$theta, SE = full_mod_nb_sw$SE); # Suggests: ~ - sex:cordless - ordered(inc):cordless anova(full_mod_nb_sw, test='Chisq'); # Update: ANOVA suggestion ~ - sex:cordless - cordless:ordered(inc) full_mod_nb_anova = glm.nb(freq ~ brand:channel + brand:ordered(inc) + brand:sex + brand:cordless + brand:cond + channel:ordered(inc) + channel:sex + channel:cordless + channel:cond + ordered(inc):sex + ordered(inc):cond + sex:cond + cordless:cond); # Additive Model full_mod_nbadd = glm.nb(freq ~ brand + channel + sex + cordless + ordered(inc) + cond); # Model Fit # Which is better? AIC Values for glm.nb models. AIC(full_mod_nbadd); AIC(full_mod_nb_sw); AIC(full_mod_nb_anova); anova(full_mod_nbadd, full_mod_nb_sw, full_mod_nb_anova, test='Chisq'); # Dissimilarity Index for glm.nb models. N1 = nrow(fulltable_1agg); nb_sw = sum(abs(resid(full_mod_nb_sw))) / (2*N1); nb_sw; nb_anova = sum(abs(resid(full_mod_nb_anova))) / (2*N1); nb_anova; nb_add = sum(abs(resid(full_mod_nbadd))) / (2*N1); nb_add; # MODEL 1 SUMMARY mean(freq); var(freq); # Note: The Negative Binomial Interaction model was able to estimate 28 more interaction odds ratios than # the Poisson Interaction model because it reports estimates for the reference levels. # ANOVA summaries anova(full_mod_dispadd, full_mod_disp_sw, test='Chisq'); anova(full_mod_nbadd, full_mod_nb_anova, test='Chisq'); anova(full_mod_nb_anova,full_mod_nbadd, test='Chisq'); anova(full_mod_dispadd, full_mod_disp_sw, full_mod_nbadd, full_mod_nb_anova, full_mod_nb_sw, test='Chisq'); summary(full_mod_disp_sw); summary(full_mod_dispadd); summary(full_mod_nb_sw)#, correlation=T); summary(full_mod_nb_anova); summary(full_mod_nbadd,correlation=TRUE); # ## PLOTS windows(100,75); title = "Distribution of Coefficeints Estimated by glm.nb"; ML_Coefficients = full_mod_nb_sw$coef; gghist = ggplot(fulltable_1agg, aes(x=ML_Coefficients)) + xlab('Coefficients') + ylab("Count"); gghist + geom_histogram(colour="darkgreen", fill="lightgreen", binwidth = .1) + opts(title = title); #+scale_y_reverse() + coord_flip() windows(100,75); title = "Distribution of Coefficeints Estimated by glm.poisson.disp"; MM_Coefficients = full_mod_disp_sw$coef; gghist = ggplot(fulltable_1agg, aes(x=MM_Coefficients)) + xlab('Coefficients') + ylab("Count"); gghist + geom_histogram(colour="darkgreen", fill="lightgreen", binwidth = .1) + opts(title = title); # Histogram of observed frequencies. windows(100,75); title = "Distribution for Counts of Sales Frequency"; Frequency = freq; gghist = ggplot(fulltable_1agg, aes(x=Frequency)) + xlim(0,50000) + ylim(0,160) + ylab("Count"); gghist + geom_histogram(colour="darkgreen", fill="white", binwidth = .8) + opts(title = title); # Scatterplot of conditional mean and variances illustrating linearity for BRAND. windows(); title = "Linearity of Conditional Mean and Standard Deviation"; Conditional_Mean = Conditional_mu_sd_1brand$Mean; Conditional_SD = Conditional_mu_sd_1brand$SD; ggscat = ggplot(Conditional_mu_sd_1brand, aes(Conditional_Mean, Conditional_SD)) + xlim(0,4500) + ylim(0,4500); ggscat + geom_point(colour="darkgreen") + xlab("Conditional Mean") + ylab("Conditional SD"); # + opts(title = title); windows(); title = "Linearity of Conditional Mean and Standard Deviation"; Conditional_Mean = Conditional_mu_sd_1brand$Mean; Conditional_SD = Conditional_mu_sd_1brand$SD; ggscat = ggplot(Conditional_mu_sd_1brand, aes(Conditional_Mean, Conditional_SD)) + xlim(0,600) + ylim(0,2000); ggscat + geom_point(colour="darkgreen") + xlab("Conditional Mean") + ylab("Conditional SD"); # + opts(title = title); # ECDFs of models poiadd = fitted.values(full_mod_dispadd); disp_anova = fitted.values(full_mod_disp_sw); # Note: _sw and _anova are two-in-the-same. nbadd = fitted.values(full_mod_nbadd); nb_sw = fitted.values(full_mod_nb_sw); nb_swquasi = fitted.values(full_mod_swquasiPoi); ####### QUASI MODEL FITS BETTER ######## # Plot with all four model curves windows(); title = "Model Fit"; d.f = data.frame( Curve = as.factor(rep(c("Observed Frequency", "glm.nb","glm.poisson.disp", "glm.nb Interaction","glm.poisson.disp Interaction"), each=2490)), val = c(sample(freq,2490,replace=FALSE), sample(nbadd,2490,replace=FALSE), sample(poiadd,2490,replace=FALSE), sample(nb_sw,2490,replace=FALSE), sample(disp_anova,2490,replace=FALSE)) ); d.f = arrange(d.f,Curve,val); d.f.ecdf = ddply(d.f, .(Curve), transform, ecdf=ecdf(val)(val)); p = ggplot( d.f.ecdf, aes(val, ecdf, colour = Curve)) + xlim(500,5000) + ylim(.9,1) + scale_colour_hue(name="Model Curve", breaks=c("Observed Frequency","glm.nb","glm.poisson.disp","glm.nb Interaction","glm.poisson.disp Interaction")); p + geom_step() + xlab("Index") + ylab("Empirical CDF") + opts(legend.justification=c(.5,.475), legend.position=c(.754,.4975), legend.background = theme_rect(fill="grey90", size=.5), title=title); # Plot with interaction curves windows(); title = "Model Fit"; d.f = data.frame( Curve = as.factor(rep(c("Observed Frequency","glm.nb Interaction","glm.poisson.disp Interaction"), each=2490)), ####### QUASI MODEL FITS BETTER ######## val = c(sample(freq,2490,replace=FALSE), sample(nb_sw,2490,replace=FALSE), sample(disp_anova,2490,replace=FALSE)) ); d.f = arrange(d.f,Curve,val); d.f.ecdf = ddply(d.f, .(Curve), transform, ecdf=ecdf(val)(val)); p = ggplot( d.f.ecdf, aes(val, ecdf, colour = Curve)) + xlim(500,5000) + ylim(.9,1) + scale_colour_hue(name="Model Curve", breaks=c("Observed Frequency","glm.nb Interaction","glm.poisson.disp Interaction")); ####### QUASI MODEL FITS BETTER ######## p + geom_step() + xlab("Index") + ylab("Empirical CDF") + opts(legend.justification=c(.5,.5), legend.position=c(.754,.50625), legend.background = theme_rect(fill="grey90", size=.5), title=title); # Curve Plot of observed and expected model estimates val = data.frame(freq,nb_sw,disp_anova,nbadd); valdf = stack(val, c(freq,nb_sw,disp_anova,nbadd)); val = valdf$val index = cbind(as.factor(rep(c('1':'2490','1':'2490','1':'2490','1':'2490'), each=1))); d.f = data.frame( val,index, Curve = as.factor(rep(c("Observed Frequency","glm.nb Interaction","glm.poisson.disp Interaction","glm.nb"), each=2490)) ) windows(); title = "Observed and Expected Sales Frequencies"; p = ggplot(d.f, aes(x=index, y=val, colour=Curve, group=Curve)) p + geom_line() + xlim(2300,2495) + ylim(0,10000) + xlab("Index") + ylab("Sales Volume") + scale_colour_brewer(palette="Set1") + scale_colour_hue(l=60, c=110, name="Model Curve", breaks=c("Observed Frequency","glm.nb Interaction","glm.poisson.disp Interaction","glm.nb")) + opts(legend.justification=c(.68,.235), legend.position=c(.4025,.65), legend.background = theme_rect(fill="grey90", size=.5), title=title); val = data.frame(freq,nb_sw); valdf = stack(val, c(freq,nb_sw)); val = valdf$val index = cbind(as.factor(rep(c('1':'2490','1':'2490'), each=1))); d.f = data.frame( val,index, Curve = as.factor(rep(c("Observed Frequency","glm.nb Interaction"), each=2490)) ) windows(); title = "Observed and Expected Sales Frequencies"; p = ggplot(d.f, aes(x=index, y=val, colour=Curve, group=Curve)) + xlim(2300,2489) + ylim(0,10000) + xlab("Index") + ylab("Sales Volume") p + geom_line() + scale_colour_brewer(palette="Dark2") + scale_colour_hue(l=60, c=110, name="Model Curve", breaks=c("Observed Frequency","glm.nb Interaction")) + opts(legend.justification=c(.6775,.235), legend.position=c(.40125,.653), legend.background = theme_rect(fill="grey90", size=.5), title=title); # Histogram of Deviance residuals for glm.poisson.disp interaction model (Normally distributed). windows(); title = "glm.poisson.disp Residuals"; Deviance_Residuals = cbind(residuals(full_mod_disp_sw, type='deviance')); deviance_residuals = data.frame(cbind(residuals(full_mod_disp_sw, type='deviance'))); gghist = ggplot(deviance_residuals, aes(x=Deviance_Residuals)) + xlab('Deviance Residuals') + ylab("Count"); gghist + geom_histogram(colour="darkgreen", fill="lightgreen", binwidth = .1) + opts(title = title); windows(); title = "glm.nb Residuals"; Deviance_Residuals = cbind(residuals(full_mod_nb_sw, type='deviance')); deviance_residuals = data.frame(cbind(residuals(full_mod_nb_sw, type='deviance'))); gghist = ggplot(deviance_residuals, aes(x=Deviance_Residuals)) + xlab('Deviance Residuals') + ylab("Count"); gghist + geom_histogram(colour="darkgreen", fill="lightgreen", binwidth = .1) + opts(title = title); # Model Residuals windows(); plot(full_mod_disp_sw, lty=1, col = 4, with=1:6); plot(full_mod_nb_sw, lty=1, col = 4, with=1:6); # Cook's Distances for glm.nb windows(); Cook1 = cbind(cooks.distance(full_mod_nb_sw)); plot(Cook1,col=4,lty=1,ylab="Cook's Distance", main="Cook's Distances for glm.nb"); # Cook's Distances for glm.poisson.disp windows(); Cook2 = cbind(cooks.distance(full_mod_disp_sw)); plot(Cook2,col=4,lty=1,ylab="Cook's Distance", main="Cook's Distances for glm.poisson.disp"); # DF Beta Plots for all variable levels # influence.measures(full_mod_nb_sw) windows(); dfbetaPlots(full_mod_nb_sw,grid=TRUE,labels=rownames(dfbeta),col=4); windows(); dfbetaPlots(full_mod_disp_sw,grid=TRUE,labels=rownames(dfbeta),col=4); # Moment Estimated Stepwise Model: Individual predictors # BRAND # Deviance Residuals windows(); par(mar=c(8,6,4,2), mgp = c(4,1,0)); plot(brand, residuals(full_mod_disp_sw, type='deviance'), col=rainbow(25), xant="n",xlab="BRAND", axes=F, ann=T, ylab='Deviance Residuals'); axis(1, at=1:21, labels=FALSE); axis(2); box(); labels = paste(levels(brand), 1:21, sep=" "); text(1:21, par("usr")[3]-0.275, srt=45, adj=1, labels=levels(brand), xpd=TRUE, cex=.8); # CHANNEL # Deviance Residuals windows(); par(mar=c(8,6,4,2), mgp = c(3,1,0)); plot(channel, residuals(full_mod_disp_sw, type='deviance'), col=rainbow(6), xant="n",xlab="", axes=F, ann=T, ylab='Deviance Residuals'); axis(1,labels=FALSE); axis(2); box(); labels = paste(levels(channel), 1:5, sep=" "); text(1:5, par("usr")[3]-0.5, srt=45, adj=1, labels = levels(channel), xpd=TRUE); mtext(1,text='CHANNEL', line=6) # SEX # Deviance Residuals windows(); plot(sex, residuals(full_mod_disp_sw, type='deviance'), xlab='SEX', ylab='Deviance Residual', col=rainbow(6)); # INCOME # Deviance Residuals windows(); plot(inc, residuals(full_mod_disp_sw, type='deviance'), xlab='INCOME', ylab='Deviance Residual', col=rainbow(6)); # CORDLESS # Deviance Residuals windows(); plot(cordless, residuals(full_mod_disp_sw, type='deviance'), xlab='CORDLESS', ylab='Deviance Residual', col=c(4,7)); # CONDITION # Deviance Residuals windows(); plot(cond, residuals(full_mod_disp_sw, type='deviance'), xlab='CONDITION', ylab='Deviance Residual', col=c(4,7)); # NB Stepwise Model: Individual predictors # BRAND # Deviance Residuals windows(); par(mar=c(8,6,4,2), mgp = c(4,1,0)); plot(brand, residuals(full_mod_nb_sw, type='deviance'), col=rainbow(25), xant="n",xlab="BRAND", axes=F, ann=T, ylab='Deviance Residuals'); axis(1, at=1:21, labels=FALSE); axis(2); box(); labels = paste(levels(brand), 1:21, sep=" "); text(1:21, par("usr")[3]-0.275, srt=45, adj=1, labels=levels(brand), xpd=TRUE, cex=.8); # CHANNEL # Deviance Residuals windows(); par(mar=c(8,6,4,2), mgp = c(3,1,0)); plot(channel, residuals(full_mod_nb_sw, type='deviance'), col=rainbow(6), xant="n",xlab="", axes=F, ann=T, ylab='Deviance Residuals'); axis(1,labels=FALSE); axis(2); box(); labels = paste(levels(channel), 1:5, sep=" "); text(1:5, par("usr")[3]-0.5, srt=45, adj=1, labels = levels(channel), xpd=TRUE); mtext(1,text='CHANNEL', line=6) # SEX # Deviance Residuals windows(); plot(sex, residuals(full_mod_nb_sw, type='deviance'), xlab='SEX', ylab='Deviance Residual', col=rainbow(6)); # INCOME # Deviance Residuals windows(); plot(inc, residuals(full_mod_nb_sw, type='deviance'), xlab='INCOME', ylab='Deviance Residual', col=rainbow(6)); # CORDLESS # Deviance Residuals windows(); plot(cordless, residuals(full_mod_nb_sw, type='deviance'), xlab='CORDLESS', ylab='Deviance Residual', col=c(4,7)); # CONDITION # Deviance Residuals windows(); plot(cond, residuals(full_mod_nb_sw, type='deviance'), xlab='CONDITION', ylab='Deviance Residual', col=c(4,7)); # Model Fit # Excessive, unaccounted for variability in the data if mu is greater than mu+a*mu^2 Observed = var(freq); f3 = full_mod_nbadd$theta; m = mean(fulltable_1agg$freq); Additive = m+ f3*(m)^2; f4 = full_mod_nb_sw$theta; m = mean(fulltable_1agg$freq); Interaction = m+ f4*(m)^2; Overshoot = as.data.frame(rbind(Observed,Additive,Interaction)); names(Overshoot) = 'Variance'; Overshoot; #It's good # The NB does not adjust enough if the dispersion statistic, theta, if <1 Additive_theta = full_mod_nbadd$theta; Interaction_theta = full_mod_nb_sw$theta; Correction = as.data.frame(rbind(Additive_theta,Interaction_theta)); names(Correction) = 'theta'; Correction; #It's good # Are less than 20% of the expected values are below 5? Yes. fits = cbind(fitted(full_mod_nb_sw)); fits = as.vector(cbind(fits[order(fits)])); N = length(freq); i=0; small_expctd=0; for(i in 1:N) { if (fits[i]<5) { i = i+1; small_expctd = small_expctd + 1; } else{} pct_below_5 = small_expctd/N; } small_expctd; N; pct_below_5; # A likelihood ratio (LR) test can be used to test the null hypothesis that the restriction implicit in the # Poisson model is true. H0: theta = 1 v. H1: theta ne 1. "odTestw" is for NB, and "dispersiontest" is for Poi. odTest(full_mod_nbadd); odTest(full_mod_nb_sw); dispersiontest(full_mod_disp_sw, trafo = 2, alternative = c("greater","two.sided","less")); # AIC AIC(full_mod_dispadd, full_mod_disp_sw, full_mod_nb_sw, full_mod_nb_anova, full_mod_nbadd); # Dissimilarity Dissim_1 = as.data.frame(rbind(dispoi_sw, dispoi_add, nb_sw1, nb_add1)); names(Dissim_1)= c('Prop Off'); Dissim_1; # (%) Proportion of poorly predicted coefficients for all four models Poor_1 = as.data.frame(rbind(prop.grtr1poisw, prop.grtr1poiadd, prop.grtrnb1_sw, prop.grtr1_nbadd)); names(Poor_1) = '%(2*SE>|Coeff|)'; Poor_1; # Deviance Value/df: Measure of fit--dispersion nb1_nbadd_gof = full_mod_nbadd$deviance/full_mod_nbadd$df.residual; disp1_dispsw_gof = full_mod_disp_sw$deviance/full_mod_disp_sw$df.residual; nb1_sw_gof = full_mod_nb_sw$deviance/full_mod_nb_sw$df.residual; disp1_add_gof = full_mod_dispadd$deviance/full_mod_dispadd$df.residual; lof_1 = as.data.frame(rbind(nb1_nbadd_gof, disp1_dispsw_gof, nb1_sw_gof, disp1_add_gof)); names(lof_1) = 'Value/df'; # Estimated Dispersion Parameters: full_dispadd = full_mod_dispadd$dispersion; full_nbadd = 1/full_mod_nbadd$theta; full_dispsw = full_mod_disp_sw$dispersion; full_nbsw = 1/full_mod_nb_sw$theta; dispersion_parameters = as.data.frame(rbind(full_nbadd, full_dispsw, full_nbsw, full_dispadd)); names(dispersion_parameters) = 'phi=1/theta'; # Dispersion Parameters and LOF values dispersion_lof = as.data.frame(cbind(dispersion_parameters, lof_1)); dispersion_lof; quasi_poi = 1/21.58384; # OR logistic.display(full_mod_dispadd); logistic.display(full_mod_nb_sw);