# Title: # Addressing Statistical Assumptions using R: # You Know What Happens When You "Assume" # Author: Danney Rasco # Affiliation: # Department of Psychology, Sociology, and Social Work # West Texas A&M University #Install and attach packages---- PacMan <- function(pkg){ new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])] if (length(new.pkg)) install.packages(new.pkg, dependencies = TRUE) sapply(pkg, library, character.only = TRUE)} pkgs <- c("ggplot2", "scatterplot3d", "rgl", "car", "MVN", "dplyr", "tidyr", "DescTools", "nortest", "olsrr", "ez", "apaTables") PacMan(pkgs) # #Set working directory if needed setwd("C:/Users/.../Desktop/FolderForProject") # #Import data---- AssumeDF <- read.csv("gradeData_2020_v3.csv", header=TRUE, na.strings=c("NA", " ", "")) names(AssumeDF) View(AssumeDF) str(AssumeDF) AssumeDF$Part_ID <- factor(AssumeDF$Part_ID) str(AssumeDF) # #Assumption testing---- ##Visually Explore Data and Check Assumptions---- ###Normality and Univariate Outliers---- #Histogram of SAT by School Type library(ggplot2) ggplot(data=AssumeDF, aes(x=FinalExam)) + geom_histogram(position="identity", alpha=0.8, binwidth = 10) + theme_classic() + facet_wrap(~schoolType) # #Boxplot: Final Exam by School Type with outliers #coef adjusts cutoff for outliers ggplot(data=AssumeDF, aes(x=schoolType, y=FinalExam, color=schoolType)) + geom_boxplot(width=.4, outlier.color = "red", coef=1.5) + labs(title="Final Exam Boxplot", x="School Type", y="Final Exam") + theme_classic() + theme(legend.position="none") + scale_color_brewer(palette="Dark2") # #Q-Q Plot by Group ggplot(data=AssumeDF, aes(sample=FinalExam, color=schoolType)) + stat_qq() + stat_qq_line() # #Visually check linearity and homoscedasticity---- ##Bivariate Scatterplot ggplot(data=AssumeDF, aes(x=SAT, y=FinalExam))+ geom_point()+ geom_smooth(method="lm", se=F) # ggplot(data=AssumeDF, aes(x=minStudy, y=FinalExam))+ geom_jitter(size=2, shape=1)+ geom_smooth(method="lm", se=F) # ##Visualizing Three or Four Variables---- ###Stationary 3D plot library(scatterplot3d) ScatEXAM_DF <- scatterplot3d( AssumeDF$FinalExam ~ AssumeDF$SAT + AssumeDF$minStudy, main="Final Exam \n by SAT and Minutes Studied", xlab="SAT", ylab="Min. Studied", zlab="Final Exam") ScatPlane <- lm(FinalExam~SAT+minStudy, data=AssumeDF) ScatEXAM_DF$plane3d(ScatPlane) # ###Interactive 3D plot library(rgl) plot3d(FinalExam~SAT+minStudy, data=AssumeDF, xlab="SAT", ylab="Min. Studied", zlab="Final Exam") # ###Scatterplot with Grouping Var. library(car) scatter3d(FinalExam~SAT+minStudy| schoolType, data=AssumeDF, xlab="SAT", ylab="Final Exam", zlab="minStudy") # #Quantitative Evaluate Assumptions---- ##Univariate and Multivariate Normality---- #Create data frame with variables of interest VarsInt <- subset.data.frame(AssumeDF, select=c(SAT, minStudy, FinalExam)) View(VarsInt) # library(MVN) MultNorm <- mvn(VarsInt, mvnTest = "royston", univariateTest = "Lillie", multivariatePlot = "qq", multivariateOutlierMethod = "quan", showOutliers=T, showNewData = T) View(MultNorm$multivariateNormality) View(MultNorm$univariateNormality) View(MultNorm$Descriptives) View(MultNorm$newData) # #Create Data Frame without Multivariate Outliers OutlierRemove_DF <- data.frame(MultNorm$newData) View(OutlierRemove_DF) # #Plot multivariate normality by group VarsInt_wGrp <- subset.data.frame(AssumeDF, select=c(schoolType, SAT, minStudy, FinalExam)) MultNormByGrp <- mvn(VarsInt_wGrp, subset="schoolType", mvnTest = "royston", multivariatePlot = "qq") View(MultNormByGrp$multivariateNormality$private) View(MultNormByGrp$univariateNormality$private) # #Descriptive Statistics View(MultNormByGrp$Descriptives$`public`) # ##Univariate Normality by Group---- #Skewness and Kurtosis library(dplyr) library(tidyr) library(DescTools) # SumTab <- AssumeDF %>% summarize_at(vars(SAT, minStudy, FinalExam), c(M=mean, SD=sd, Mdn=median, skew=Skew, kurt=Kurt))%>% gather(stat, value) %>% separate(stat, into = c("Var", "stat")) %>% spread(stat, value) %>% mutate_if(is.numeric, round, 3) %>% mutate_if(is.numeric, format, nsmall=3) %>% select_at(vars(Var, M, SD, Mdn, skew, kurt)) View(SumTab) # #Kolmogorov-Smirnov (KS) with Lilliefors correction library(dplyr) library(nortest) # LillTestObj <- lillie.test(AssumeDF$FinalExam) View(LillTestObj) LillTestObj$statistic LillTestObj$p.value #KS with Lilliefors correction by group LillieBySchool <- AssumeDF %>% group_by(schoolType) %>% summarize( Lillie_D=lillie.test(FinalExam)$statistic, Lillie_pVal=lillie.test(FinalExam)$p.value) %>% mutate_if(is.numeric, round, 3) View(LillieBySchool) # #Shapiro-Wilk by Group ShapiroBySchool <- AssumeDF %>% group_by(schoolType) %>% summarize( Shapiro_W=shapiro.test(FinalExam)$statistic, Shapiro_pVal=shapiro.test(FinalExam)$p.value) %>% mutate_if(is.numeric, round, 3) View(ShapiroBySchool) # ##Homogeneity of Variance---- library(car) leveneTest(FinalExam~schoolType, data=AssumeDF) # ##Evaluate Residuals---- ###Run Linear Model MultReg <- lm( formula = FinalExam ~ SAT + minStudy, data = AssumeDF) # ###Add predicted and residual values to data frame AssumeDF$FinExamPred <- predict(MultReg) AssumeDF$FinExamResid <- residuals(MultReg) View(AssumeDF) # ###Create histogram of residuals ggplot(data=AssumeDF, aes(x=FinExamResid))+ geom_histogram(position="identity", alpha=0.8, binwidth=5, col="white")+ labs(title="Residuals Histogram")+ theme_classic() # ###Check the mean of the residuals is 0 summary(AssumeDF$FinExamResid) # ####Plot Predicted by Residual ggplot(data=AssumeDF, aes(x=FinExamPred, y=FinExamResid))+ geom_point()+ geom_smooth(method="lm", se=F) # ##Check Collinearity---- library(olsrr) CollTest <- ols_coll_diag(MultReg) View(CollTest$vif_t %>% mutate_if(is.numeric, round, 3)) # #ANOVA---- library(ez) ezEXAMaov <- ezANOVA(data=AssumeDF, dv=FinalExam, wid=Part_ID, between=schoolType, type=3, return_aov=T, detailed=F) ezEXAMaov PostAOV <- TukeyHSD(ezEXAMaov$aov) View(round(PostAOV$schoolType, 3)) #Addressing Violations---- ##ANOVA Alternatives---- ###Kruskal-Wallis Test kruskal.test(FinalExam~schoolType, data=AssumeDF) pairWilcox <- pairwise.wilcox.test( x=AssumeDF$FinalExam, g=AssumeDF$schoolType, p.adj = "bonferroni", paired=F) View(round(pairWilcox$p.value, 4)) # ###Bootstrapped ANOVA results library(ez) bootAOV <- ezBoot(data=AssumeDF, wid=Part_ID, dv=FinalExam, between=schoolType, iterations=500, resample_within=FALSE) ezPlot2(pred=bootAOV, x=schoolType) boot_DF <- data.frame(bootAOV$boots) View(boot_DF) BootTab <- boot_DF %>% group_by(schoolType) %>% summarize(N=n(),Mean=mean(value), LL= round(quantile( value, prob=.025), 3), UL= round(quantile( value, prob=.975), 3)) View(BootTab) # #Regression---- MultReg <- lm( formula = FinalExam ~ SAT + minStudy, data = AssumeDF) summary.lm(MultReg) ##Regression Alternatives---- ###Bootstrap confidence intervals set.seed(42)#For reproducibility library(apaTables) apa.reg.boot.table(MultReg, filename="BootMultReg5000.doc", table.number=1002, number.samples = 5000)