#Set seed so your answers are all the same
set.seed(9)
# Sample Per class room people
n1 <- 20; n2 <- 20; n3 <- 20; n4 <- 20
N<-n1+n2+n3+n4 # Total N
# Uniform distrobution of proportion of time per classroom
X1 <- runif(n1, 0, .35)
X2 <- runif(n2, .3, .55)
X3 <- runif(n3, .5, .75)
X4 <- runif(n4, .7,1.0)
# noise per classroom
e1 <- rnorm(n1, 0, sd=2.5)
e2 <- rnorm(n2, 0, sd=2.5)
e3 <- rnorm(n3, 0, sd=2.5)
e4 <- rnorm(n4, 0, sd=2.5)
# Intercepts per classroom
B0.1 <- 80
B0.2 <- 70
B0.3 <- 60
B0.4 <- 50
# Same slope per classroom
B1=10
# Our equation to create Y for each classroom
Y1 = B1*scale(X1,scale=F) + B0.1 + e1
Y2 = B1*scale(X2,scale=F) + B0.2 + e2
Y3 = B1*scale(X3,scale=F) + B0.3 + e3
Y4 = B1*scale(X4,scale=F) + B0.4 + e4
# Merge classrooms into 1 data.frame
Math.Data<-data.frame(Math=c(Y1,Y2,Y3,Y4),ActiveTime=c(X1,X2,X3,X4),
Classroom=c(rep("C1",n1),rep("C2",n2),rep("C3",n3),rep("C4",n4)),
StudentID=as.factor(1:N))
library(ggplot2)
theme_set(theme_bw())
ClassRoom.Plot.1 <-ggplot(data = Math.Data,
aes(x = ActiveTime, y=Math))+ #scaffold
geom_point()+ # add layer of scatterplot
geom_smooth(method = "lm", se = TRUE)+ # add regression line
xlab("Proportion of Time Engaged in Active Learning")+
ylab("Math Score") # add labels
ClassRoom.Plot.1 #call plot
library(stargazer)
Class.All<-lm(Math~ActiveTime, data = Math.Data)
stargazer(Class.All,type="latex",
intercept.bottom = FALSE, single.row=TRUE,
star.cutoffs=c(.05,.01,.001), notes.append = FALSE,
header=FALSE)
ClassRoom.Plot.2 <-ggplot(data = Math.Data,
aes(x = ActiveTime, y=Math))+
geom_point(aes(colour = Classroom, shape=Classroom))+ # we add color by cluster
geom_smooth(method = "lm", se = TRUE)+
xlab("Proportion of Time Engaged in Active Learning")+
ylab("Math Score")+ # add labels
theme(legend.position = "top")
ClassRoom.Plot.2
ClassRoom.Plot <-ggplot(data = Math.Data,
aes(x = ActiveTime, y=Math))+
geom_point(aes(colour = Classroom, shape=Classroom))+
geom_smooth(method = "lm", se = TRUE, aes(group = Classroom))+# we add group level
xlab("Proportion of Time Engaged in Active Learning")+ylab("Math Score")+ # add labels
theme(legend.position = "top")
ClassRoom.Plot
Class1<-lm(Math~ActiveTime, data = subset(Math.Data,Classroom=="C1"))
Class2<-lm(Math~ActiveTime, data = subset(Math.Data,Classroom=="C2"))
Class3<-lm(Math~ActiveTime, data = subset(Math.Data,Classroom=="C3"))
Class4<-lm(Math~ActiveTime, data = subset(Math.Data,Classroom=="C4"))
stargazer(Class1,Class2,Class3,Class4,type="latex",
intercept.bottom = FALSE, single.row=TRUE,
star.cutoffs=c(.05,.01,.001), notes.append = FALSE,
header=FALSE)
intercepts.Math.Data<-unname(c(Class1$coef[1],Class2$coef[1],
Class3$coef[1],Class4$coef[1]))
M.Intercept<-mean(intercepts.Math.Data)
SD.Intercept<-sd(intercepts.Math.Data)
library(apa)
t.intercept.Math.Data<-t_apa(t.test(intercepts.Math.Data), print=FALSE)
Slope.Math.Data<-unname(c(Class1$coef[2],Class2$coef[2],
Class3$coef[2],Class4$coef[2]))
M.Slope<-mean(Slope.Math.Data)
SD.Slopet<-sd(Slope.Math.Data)
t.Slope.Math.Data<-t_apa(t.test(Slope.Math.Data), print=FALSE)
The slope mean = 12.34 with a SD = 1.97, and the one sample t-test, t(3) = 12.51, p = .001, d = 6.25
\(R^2\) is alittle harder as you cannot simply take a mean and SD of the r values. You must do a Fisher’s r’-to-z \[ r.to.z' = \frac{1}{2}[log_e(1+r) - log_e(1-r)] = archtanh(r) \] and back again
\[ z'.to.r = tanh(z') \]
#Functions for fishers transforms
FisherRtoZ <-function(r) {.5*((log(1+r)-log(1-r)))}
FisherZtoR <-function(z) {tanh(z)}
R2.Math.Data<-FisherRtoZ(c(summary(Class1)$r.squared^.5,summary(Class2)$r.squared^.5,
summary(Class3)$r.squared^.5,summary(Class4)$r.squared^.5))
M.R2<-FisherZtoR(mean(R2.Math.Data))^2
SD.R2<-FisherZtoR(sd(R2.Math.Data))^2
t.R2.Math.Data<-t_apa(t.test(R2.Math.Data), print=FALSE)
In all cases the \(R^2\), slope, and intercept were significantly different from zero, but is this analysis make sense? What have we lost? Well, we lost the SE from regression models. This is an anti-conservative approach (you dumped all the variance from the regressions)
### Rescale DV
library(plyr)
# Create a new variable in the data which is the mean of Math Score per class room (at each subject)
Math.Data<-ddply(Math.Data,.(Classroom), mutate, ClassMeanMath = mean(Math))
# next we center relative to each student relative to their class Math mean (how much to they differ from their cohort)
Math.Data$Math.Class.Centered<-Math.Data$Math-Math.Data$ClassMeanMath
ClassRoom.Plot.3 <-ggplot(data = Math.Data, aes(x = ActiveTime, y=Math.Class.Centered))+
geom_point(aes(colour = Classroom, shape=Classroom))+
geom_smooth(method = "lm", se = TRUE)+# we add group level
xlab("Proportion of Time Engaged in Active Learning")+
ylab("Math Score (Classroom Centered)")+
theme(legend.position = "top")
ClassRoom.Plot.3
Class.All.C<-lm(Math.Class.Centered~ActiveTime, data = Math.Data)
stargazer(Class.All.C,type="latex",
intercept.bottom = FALSE, single.row=TRUE,
star.cutoffs=c(.05,.01,.001), notes.append = FALSE,
header=FALSE)
ClassRoom.Plot.4 <-ggplot(data = Math.Data,
aes(x = ActiveTime, y=Math.Class.Centered))+
geom_point(aes(colour = Classroom, shape=Classroom))+
geom_smooth(method = "lm", se = TRUE)+# we add group level
geom_smooth(method = "lm", se = TRUE, aes(group = Classroom, color=Classroom))+
xlab("Proportion of Time Engaged in Active Learning")+
ylab("Math Score (Classroom Centered)")+ # add labels
theme(legend.position = "top")
ClassRoom.Plot.4
### Rescale IV
Math.Data<-ddply(Math.Data,.(Classroom), mutate, ClassActiveTime = mean(ActiveTime))
Math.Data$ActiveTime.Class.Centered<-Math.Data$ActiveTime-Math.Data$ClassActiveTime
ClassRoom.Plot.5 <-ggplot(data = Math.Data,
aes(x = ActiveTime.Class.Centered, y=Math.Class.Centered))+
geom_point(aes(colour = Classroom, shape=Classroom))+
geom_smooth(method = "lm", se = TRUE)+# we add group level
xlab("Proportion of Time Engaged in Active Learning\n(Centered per Classroom)")+
ylab("Math Score (Classroom Centered")+ # add labels
theme(legend.position = "top")
ClassRoom.Plot.5
Class.All.C2<-lm(Math.Class.Centered~ActiveTime.Class.Centered, data = Math.Data)
stargazer(Class.All.C2,type="latex",
intercept.bottom = FALSE, single.row=TRUE,
star.cutoffs=c(.05,.01,.001), notes.append = FALSE,
header=FALSE)
Well now what do have? Well, our slope makes sense, our degrees of freedom are bigger (but wrong), but our intercept is always going to be zero! We lost our intercept.
\[ Y_i = \beta_1X_1 +\beta_0 + \epsilon\]
Within each Group/Cluster (Students = \(i\) within Classroom 1 = \(j\), and we would replace \(j\) with \(g\) for Classroom 2 and so forth for each classroom!) \[y_{ij} = B_{1j}X_{ij} + B_{0j} + r_{ij} \]
Where \(B_{0j}\) = intercept in group j
Where \(B_{ij}\) = slope of students in group j
Where \(r_{ij}\) = residuals of each i (student) within group j
\[B_{0j} = \gamma_{00}+u_{0j} \]
\[B_{1j} = \gamma_{10}+u_{1j} \]
The degree of clustering is measured via the inter-class correlation ICC
The proportion of total variance that is between the groups \[ ICC = \frac{\tau}{\tau + \sigma^2}\]
\(\tau\) = variance in a variable due to differences between a group
\(\sigma^2\) = total variance across groups
We would fit this on the null model (no level 1 predictors)
library(lme4) #mixed model package by Douglas Bates et al
Model.Null<-lmer(Math ~1+(1|Classroom),
data=Math.Data, REML=FALSE)
summary(Model.Null)
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
## method [lmerModLmerTest]
## Formula: Math ~ 1 + (1 | Classroom)
## Data: Math.Data
##
## AIC BIC logLik deviance df.resid
## 408.6 415.7 -201.3 402.6 77
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.34755 -0.72263 0.01011 0.73079 2.21975
##
## Random effects:
## Groups Name Variance Std.Dev.
## Classroom (Intercept) 126.884 11.264
## Residual 6.668 2.582
## Number of obs: 80, groups: Classroom, 4
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 64.70 5.64 4.00 11.47 0.000329 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ranef(Model.Null)
## $Classroom
## (Intercept)
## C1 14.959676
## C2 5.307215
## C3 -5.190059
## C4 -15.076832
##
## with conditional variances for "Classroom"
library(performance)
ICC.Null<-icc(Model.Null)
ICC.Null
## # Intraclass Correlation Coefficient
##
## Adjusted ICC: 0.950
## Unadjusted ICC: 0.950
library(sjPlot)
library(glmmTMB)
plot_model(Model.Null, type ='re',
facet.grid = FALSE,
sort.est = "sort.all",
y.offset = .4)
Model.1<-lmer(Math ~ActiveTime+(1|Classroom),
data=Math.Data, REML=FALSE)
summary(Model.1)
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
## method [lmerModLmerTest]
## Formula: Math ~ ActiveTime + (1 | Classroom)
## Data: Math.Data
##
## AIC BIC logLik deviance df.resid
## 399.3 408.8 -195.6 391.3 76
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.13111 -0.77650 -0.04481 0.61218 2.50097
##
## Random effects:
## Groups Name Variance Std.Dev.
## Classroom (Intercept) 200.00 14.142
## Residual 5.61 2.368
## Number of obs: 80, groups: Classroom, 4
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 58.862 7.261 4.315 8.107 0.000907 ***
## ActiveTime 11.269 3.140 77.695 3.589 0.000579 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## ActiveTime -0.224
plot_model(Model.1, type ='re',
facet.grid = FALSE,
sort.est = "sort.all",
y.offset = .4)
library(effects)
Results.Model.1<-Effect(c("ActiveTime"),Model.1,
xlevels=list(ActiveTime=seq(0,1,.2)))
Results.Model.1<-as.data.frame(Results.Model.1)
Final.Fixed.Plot.1 <-ggplot(data = Results.Model.1,
aes(x = ActiveTime, y =fit))+
geom_line(size=2)+
coord_cartesian(xlim = c(0, 1),ylim = c(0, 100))+
geom_ribbon(aes(ymin=fit-se, ymax=fit+se),alpha=.2)+
xlab("Proportion of Time Engaged in Active Learning")+
ylab("Math Score")+
theme(legend.position = "none")
Final.Fixed.Plot.1
#Fixed only, removed random effects
Math.Data$Model.1.Fitted<-predict(Model.1, re.form=NA)
ClassRoom.Plot.Bad <-ggplot(data = Math.Data)+
coord_cartesian(xlim = c(0, 1),ylim = c(0, 100))+
geom_point( aes(x = ActiveTime, y=Math,
colour = Classroom, shape=Classroom))+
geom_line(aes(x = ActiveTime, y=Model.1.Fitted))+
xlab("Proportion of Time Engaged in Active Learning")+
ylab("Math Score")+
theme(legend.position = "top")
ClassRoom.Plot.Bad
\[ R^2_{Marginal} = \frac{\sigma^2_{fixed}} {\sigma^2_{fixed}+\sigma^2_{random}+\sigma^2_{residual}} \]
source('RsquaredHelper.R')
rsquared.glmm(Model.1)
## Class Family Link Marginal Conditional AIC
## 1 lmerModLmerTest gaussian identity 0.04359631 0.9739071 399.2721
The Marginal \(R^2\) is terrible.
Often you cannot simply add a scatterplot. However, there are ways around this (but you end up plotting every classroom)
There is a fast and dirty method (but it will not work well when you have lots of control variables)
# Fixed + Random effects
Math.Data$Model.1.Fitted.Random<-predict(Model.1)
ClassRoom.Plot.Better <-ggplot(data = Math.Data)+
coord_cartesian(xlim = c(0, 1),ylim = c(0, 100))+
geom_point(aes(x = ActiveTime, y=Math,
colour = Classroom, shape=Classroom))+
geom_line(aes(x = ActiveTime, y=Model.1.Fitted.Random,
colour = Classroom))+
xlab("Proportion of Time Engaged in Active Learning")+
ylab("Math Score")+
theme(legend.position = "top")
ClassRoom.Plot.Better
This is what the model basically fit (controling for the random effect)!
a more accurate way to graph it is show the data in its clusters
ClassRoom.Plot.Best <-ggplot(data = Math.Data, group=Classroom)+
facet_grid(Classroom~.)+
coord_cartesian(xlim = c(0, 1),ylim = c(0, 100))+
geom_point(aes(x = ActiveTime, y=Math,
colour = Classroom, shape=Classroom))+
geom_line(aes(x = ActiveTime, y=Model.1.Fitted.Random,
colour = Classroom))+
xlab("Proportion of Time Engaged in Active Learning")+
ylab("Math Score")+
theme(legend.position = "top")
ClassRoom.Plot.Best
\[ R^2_{conditional} = \frac{\sigma^2_{fixed}+\sigma^2_{random}} {\sigma^2_{fixed}+\sigma^2_{random}+\sigma^2_{residual}} \]
rsquared.glmm(Model.1)
## Class Family Link Marginal Conditional AIC
## 1 lmerModLmerTest gaussian identity 0.04359631 0.9739071 399.2721
anova(Model.Null,Model.1)
## Data: Math.Data
## Models:
## Model.Null: Math ~ 1 + (1 | Classroom)
## Model.1: Math ~ ActiveTime + (1 | Classroom)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## Model.Null 3 408.59 415.74 -201.30 402.59
## Model.1 4 399.27 408.80 -195.64 391.27 11.321 1 0.0007662 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Math.Data$ActiveTime.C<-scale(Math.Data$ActiveTime, center=TRUE, scale=FALSE)
Model.1.GM<-lmer(Math ~ActiveTime.C+(1|Classroom),
data=Math.Data, REML=FALSE)
summary(Model.1.GM)
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
## method [lmerModLmerTest]
## Formula: Math ~ ActiveTime.C + (1 | Classroom)
## Data: Math.Data
##
## AIC BIC logLik deviance df.resid
## 399.3 408.8 -195.6 391.3 76
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.13111 -0.77650 -0.04481 0.61218 2.50097
##
## Random effects:
## Groups Name Variance Std.Dev.
## Classroom (Intercept) 200.00 14.142
## Residual 5.61 2.368
## Number of obs: 80, groups: Classroom, 4
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 64.704 7.076 3.897 9.144 0.000896 ***
## ActiveTime.C 11.269 3.140 77.695 3.589 0.000579 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## ActiveTim.C 0.000
Math.Data$Model.1.Fitted.Random.GM<-predict(Model.1.GM) #plots fixed Random effects
ClassRoom.Plot.Better <-ggplot()+
coord_cartesian(xlim = c(-.5,.5),ylim = c(0, 100))+
geom_point(data = Math.Data, aes(x = ActiveTime.C, y=Math,
colour = Classroom, shape=Classroom))+
geom_line(data = Math.Data, aes(x = ActiveTime.C, y=Model.1.Fitted.Random.GM,
colour = Classroom))+
xlab("Proportion of Time Engaged in Active Learning\n(Grand Mean Centered)")+
ylab("Math Score")+
theme(legend.position = "top")
ClassRoom.Plot.Better
### Rescale IV
library(plyr)
Math.Data<-ddply(Math.Data,.(Classroom), mutate, ClassActiveTime = mean(ActiveTime))
Math.Data$ActiveTime.Class.Centered<-Math.Data$ActiveTime-Math.Data$ClassActiveTime
Model.1.GC<-lmer(Math ~ActiveTime.Class.Centered+(1|Classroom),
data=Math.Data, REML=FALSE)
summary(Model.1.GC)
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
## method [lmerModLmerTest]
## Formula: Math ~ ActiveTime.Class.Centered + (1 | Classroom)
## Data: Math.Data
##
## AIC BIC logLik deviance df.resid
## 397.4 406.9 -194.7 389.4 76
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.16057 -0.78325 -0.04672 0.62451 2.50992
##
## Random effects:
## Groups Name Variance Std.Dev.
## Classroom (Intercept) 126.937 11.267
## Residual 5.606 2.368
## Number of obs: 80, groups: Classroom, 4
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 64.704 5.640 4.000 11.473 0.000329 ***
## ActiveTime.Class.Centered 11.990 3.159 76.000 3.795 0.000295 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## ActvTm.Cl.C 0.000
Math.Data$Model.1.Fitted.Random.GC<-predict(Model.1.GC) #plots fixed Random effects
ClassRoom.Plot.Better <-ggplot()+
coord_cartesian(xlim = c(-.3,.3),ylim = c(0, 100))+
geom_point(data = Math.Data, aes(x = ActiveTime.Class.Centered, y=Math,
colour = Classroom, shape=Classroom))+
geom_line(data = Math.Data, aes(x = ActiveTime.Class.Centered, y=Model.1.Fitted.Random.GC,
colour = Classroom))+
xlab("Proportion of Time Engaged in Active Learning\n(Centered per Classroom)")+
ylab("Math Score")+
theme(legend.position = "top")
ClassRoom.Plot.Better