diff --git a/FinalProj.Rmd b/FinalProj.Rmd index f803b2529965f71ba10482afd5e859f588f1019f..46ab234cd9d99c12174e927beeb022b90fbb30af 100644 --- a/FinalProj.Rmd +++ b/FinalProj.Rmd @@ -1,6 +1,6 @@ --- title: "finalproj403" -author: "irisqlin" +author: "Iris Lin and Hannah Zhou" date: "11/13/2021" output: html_document --- @@ -184,15 +184,18 @@ data$N.safe <- as.numeric(data$N.safe) ```{r} # response data exploration -# hist(as.numeric(data$X30.5drinks)) hist(as.numeric(data$X30.cig)) hist(as.numeric(data$X30drink)) hist(as.numeric(data$X30marijuana)) summary(data) -# TODO: as numeric all the predictor variables -round(cor(data[, -c(1, 2, 3, 4, 6)], 3)) +cor_data <- data[, -c(1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 14)] +res <- cor(cor_data) +round(res, 2) +round(cor(data[, -c(1, 2, 3, 4, 6, 7, 8, 14)]), 3) + +(diag(var(data[, -c(1, 2, 3, 4, 6, 7, 8, 14)]))) ``` @@ -200,7 +203,7 @@ round(cor(data[, -c(1, 2, 3, 4, 6)], 3)) Using the wald-test, liklihood ratio test, and the drop in deviance tests, we prefer drinks_mod_2. 
```{r} -# making models +# making models, 1 = more likely to abuse substance, 0 = unlikely to abuse substance drinks_threshold <- mean(data$X30drink) data$drinks_var <- ifelse(data$X30drink >= drinks_threshold, 1, 0) @@ -215,6 +218,7 @@ summary(drinks_mod_1) # dropped p > 0.1, drinks_mod_2 <- glm(drinks_var ~ truth + decision + excite + safe + best.school + Wpdrink, data = data, family = "binomial") summary(drinks_mod_2) +exp(coef(drinks_mod_2)) #likelihood ratio test to test whether the observed difference in model fits is statistically significant # source: https://www.listendata.com/2016/07/insignificant-levels-of-categorical-variable.html @@ -231,11 +235,10 @@ anova(drinks_mod_2, drinks_mod_1, test="Chisq") # source: https://bookdown.org/roback/bookdown-BeyondMLR/ch-poissonreg.html#cs-philippines # source: https://bookdown.org/roback/bookdown-BeyondMLR/ch-logreg.html ``` - ## Cig model ```{r} -# making models +# making models, 1 = abusing substance, 0 = not abusing substance cig_threshold <- mean(data$X30.cig) data$cig_var <- ifelse(data$X30.cig >= cig_threshold, 1, 0) @@ -256,19 +259,25 @@ summary(cig_mod_3) #likelihood ratio test to test whether the observed difference in model fits is statistically significant # source: https://www.listendata.com/2016/07/insignificant-levels-of-categorical-variable.html - anova(cig_total_mod, cig_mod_1, test="LRT") +# not sig, prefer smaller model mod 1 anova(cig_mod_1, cig_mod_2, test = "LRT") +# sig, may prefer the larger model, mod 1 anova(cig_mod_2, cig_mod_3, test="LRT") -#Both of these are not significant, which means dropping the variables we did was not significant. 
+# not sig, prefer smaller mod 3 +anova(cig_mod_1, cig_mod_3, test="LRT") +# sig, prefer the larger mod, mod 1 -# TODO: idk what this does -anova(drinks_total_mod, drinks_mod_1, test="Chisq") +# Drop in deviance test +anova(cig_mod_1, cig_total_mod, test="Chisq") +# not sig difference +anova(cig_mod_1, cig_mod_2, test = "Chisq") +# sig, so prefer the larger model, mod 1 (anova.glm treats "Chisq" and "LRT" as the same test, so this matches the LRT result) +anova(cig_mod_3, cig_mod_2, test="Chisq") +# not sig, essentially no difference in residual deviance values +anova(cig_mod_3, cig_mod_1, test="Chisq") +# The difference in deviance is statistically significant, so the drop in deviance test prefers the larger model, mod 1 -#drop in deviance test compares residual deivances from two models -# source: https://bookdown.org/roback/bookdown-BeyondMLR/ch-poissonreg.html#cs-philippines -# source: https://bookdown.org/roback/bookdown-BeyondMLR/ch-logreg.html -anova(drinks_mod_1, drinks_mod_2, test = "Chisq") ```