This R notebook can be used as a template to compute a regression with a single categorical explanatory variable. It is usable in RStudio. The content is adapted from the Multilevel Modelling Online Course (MOOC) from the Centre for Multilevel Modelling at the University of Bristol (http://www.bristol.ac.uk/cmm/learning/online-course/).
# Load the data ----
# Text file: put "3.2.txt" in the same directory as this notebook.
mydata <- read.table(file = "3.2.txt", sep = ",", header = TRUE)
# Excel file (optional): the xlsx package is only needed for read.xlsx and
# requires Java, so load it only if you actually use the Excel import.
# library(xlsx) #read xlsx files
# mydata <- read.xlsx("myfile.xlsx", sheetName = "mysheetname")
# Inspect the data ----
dim(mydata) #number of rows and columns
## [1] 33988 5
str(mydata) #variable names, storage types and first values
## 'data.frame': 33988 obs. of 5 variables:
## $ caseid : int 339 340 345 346 352 353 354 361 362 363 ...
## $ score : int 49 18 46 43 17 29 15 19 45 12 ...
## $ cohort90: int -6 -6 -6 -6 -6 -6 -6 -6 -6 -6 ...
## $ female : int 0 0 0 0 0 0 0 0 0 0 ...
## $ sclass : int 2 3 4 3 3 2 3 2 3 1 ...
mydata[1:20, ] #first 20 rows of the data
## caseid score cohort90 female sclass
## 1 339 49 -6 0 2
## 2 340 18 -6 0 3
## 3 345 46 -6 0 4
## 4 346 43 -6 0 3
## 5 352 17 -6 0 3
## 6 353 29 -6 0 2
## 7 354 15 -6 0 3
## 8 361 19 -6 0 2
## 9 362 45 -6 0 3
## 10 363 12 -6 0 1
## 11 6824 0 -4 0 1
## 12 6826 0 -4 0 3
## 13 6827 20 -4 0 2
## 14 6828 32 -4 0 1
## 15 6829 0 -4 0 2
## 16 6834 24 -4 0 3
## 17 6836 23 -4 0 2
## 18 13206 7 -2 0 3
## 19 13209 38 -2 0 3
## 20 13215 46 -2 0 1
Having viewed the data we will now examine score (ratio variable: point score calculated from awards in Standard grades. Scores range from 0 to 75, with a higher score indicating higher attainment) and female (categorical variable: girls 1; boys 0).
#continuous variable: histogram of the point scores (possible range 0 to 75)
hist(mydata$score, xlim = c(0,80))
### Summary of the data
summary(mydata$score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 19.00 33.00 31.09 45.00 75.00
sd(mydata$score) #standard deviation of score
## [1] 17.31437
# Frequency table for female: counts, proportions and cumulative proportions
freq_female <- table(mydata$female)
prop_female <- prop.table(freq_female)
mytablecomb <- cbind(freq_female, prop_female, cumsum(prop_female))
colnames(mytablecomb) <- c("Freq", "Perc", "Cum")
mytablecomb
## Freq Perc Cum
## 0 16055 0.4723726 0.4723726
## 1 17933 0.5276274 1.0000000
# Score summaries by sex (0 = boys, 1 = girls), plus an overall "Total" row
sex <- factor(mydata$female)
counts <- tapply(mydata$score, sex, length)
means <- tapply(mydata$score, sex, mean)
sds <- tapply(mydata$score, sex, sd)
tableScore <- cbind("Freq" = counts, "mean(score)" = means, "sd(score)" = sds)
tableScore <- rbind(tableScore,
                    Total = c(sum(counts), mean(mydata$score), sd(mydata$score)))
tableScore
## Freq mean(score) sd(score)
## 0 16055 29.82523 17.30544
## 1 17933 32.23108 17.24374
## Total 33988 31.09462 17.31437
\[dependent_i = \beta_0 + \beta_1 * explanatory_i + e_i\]
# Regress score on female (0 = boy, 1 = girl).
# Intercept = mean score for boys; the female coefficient is the estimated
# girls - boys difference in mean score (~2.41 points in favour of girls).
fit <- lm(score ~ female, data = mydata)
summary(fit)
##
## Call:
## lm(formula = score ~ female, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32.231 -12.231 1.769 13.769 45.175
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.8252 0.1363 218.79 <2e-16 ***
## female 2.4059 0.1877 12.82 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.27 on 33986 degrees of freedom
## Multiple R-squared: 0.004812, Adjusted R-squared: 0.004783
## F-statistic: 164.3 on 1 and 33986 DF, p-value: < 2.2e-16
\[ \hat{score}_i = 29.83 + 2.41 * female_i\]
model <- summary(fit) #keep the summary object so its components can be extracted
model$sigma^2 #residual (unexplained) variance
## [1] 298.3536
model$r.squared #proportion of variance in score explained by female
## [1] 0.004812279
The test statistic is calculated by dividing the estimated slope by its standard error
#row 2 of the coefficient table is female: t value = estimate / standard error
model$coefficients[2,1] / model$coefficients[2,2]
## [1] 12.81955
#significant if larger than 1.96 or smaller than -1.96, the critical value at the 5% level of significance
We will now examine score (ratio variable : point score calculated from awards in Standard grades. Scores range from 0 to 75, with a higher score indicating a higher attainment) and parental social class (categorical variable : managerial and professional 1; intermediate 2; working 3; unclassified 4)
#continuous variable: histogram of the point scores (possible range 0 to 75)
hist(mydata$score, xlim = c(0,80))
### Summary of the data
summary(mydata$score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 19.00 33.00 31.09 45.00 75.00
sd(mydata$score) #standard deviation of score
## [1] 17.31437
# Score summaries by parental social class (1 = managerial/professional,
# 2 = intermediate, 3 = working, 4 = unclassified), plus a "Total" row
class_f <- factor(mydata$sclass)
counts <- tapply(mydata$score, class_f, length)
means <- tapply(mydata$score, class_f, mean)
sds <- tapply(mydata$score, class_f, sd)
tableScore <- cbind("Freq" = counts, "mean(score)" = means, "sd(score)" = sds)
tableScore <- rbind(tableScore,
                    Total = c(sum(counts), mean(mydata$score), sd(mydata$score)))
tableScore
## Freq mean(score) sd(score)
## 1 11173 39.24523 15.23131
## 2 9994 31.87142 16.09893
## 3 9486 24.10183 16.25010
## 4 3335 21.35052 17.08918
## Total 33988 31.09462 17.31437
# Create one dummy (indicator) variable per social class category.
# The logical TRUE/FALSE vectors are treated as 1/0 by lm().
mydata$sclass1 <- mydata$sclass == 1 #managerial and professional
mydata$sclass2 <- mydata$sclass == 2 #intermediate
mydata$sclass3 <- mydata$sclass == 3 #working
mydata$sclass4 <- mydata$sclass == 4 #unclassified
mydata[1:20, ]
## caseid score cohort90 female sclass sclass1 sclass2 sclass3 sclass4
## 1 339 49 -6 0 2 FALSE TRUE FALSE FALSE
## 2 340 18 -6 0 3 FALSE FALSE TRUE FALSE
## 3 345 46 -6 0 4 FALSE FALSE FALSE TRUE
## 4 346 43 -6 0 3 FALSE FALSE TRUE FALSE
## 5 352 17 -6 0 3 FALSE FALSE TRUE FALSE
## 6 353 29 -6 0 2 FALSE TRUE FALSE FALSE
## 7 354 15 -6 0 3 FALSE FALSE TRUE FALSE
## 8 361 19 -6 0 2 FALSE TRUE FALSE FALSE
## 9 362 45 -6 0 3 FALSE FALSE TRUE FALSE
## 10 363 12 -6 0 1 TRUE FALSE FALSE FALSE
## 11 6824 0 -4 0 1 TRUE FALSE FALSE FALSE
## 12 6826 0 -4 0 3 FALSE FALSE TRUE FALSE
## 13 6827 20 -4 0 2 FALSE TRUE FALSE FALSE
## 14 6828 32 -4 0 1 TRUE FALSE FALSE FALSE
## 15 6829 0 -4 0 2 FALSE TRUE FALSE FALSE
## 16 6834 24 -4 0 3 FALSE FALSE TRUE FALSE
## 17 6836 23 -4 0 2 FALSE TRUE FALSE FALSE
## 18 13206 7 -2 0 3 FALSE FALSE TRUE FALSE
## 19 13209 38 -2 0 3 FALSE FALSE TRUE FALSE
## 20 13215 46 -2 0 1 TRUE FALSE FALSE FALSE
\[score_i = \beta_0 + \beta_1 * sclass1_i + \beta_2 * sclass2_i + \beta_3 * sclass4_i + e_i\]
# Regress score on the social class dummies, omitting sclass3 (working class)
# as the reference category: the intercept is the working-class mean score and
# each coefficient is the difference between that class and the working class.
fit <- lm(score ~ sclass1 + sclass2 + sclass4, data = mydata)
summary(fit)
##
## Call:
## lm(formula = score ~ sclass1 + sclass2 + sclass4, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.245 -11.245 1.755 12.129 50.898
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.1018 0.1639 147.051 <2e-16 ***
## sclass1TRUE 15.1434 0.2229 67.947 <2e-16 ***
## sclass2TRUE 7.7696 0.2288 33.954 <2e-16 ***
## sclass4TRUE -2.7513 0.3214 -8.561 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.96 on 33984 degrees of freedom
## Multiple R-squared: 0.15, Adjusted R-squared: 0.15
## F-statistic: 2000 on 3 and 33984 DF, p-value: < 2.2e-16
\[ \hat{score}_i = 24.1 + 15.14 * sclass1_i + 7.77 * sclass2_i - 2.75 * sclass4_i\] ### Residual (unexplained) variance
model <- summary(fit) #keep the summary object so its components can be extracted
model$sigma^2 #residual (unexplained) variance
## [1] 254.8285
model$r.squared #proportion of variance in score explained by social class
## [1] 0.1500447
The test statistic is calculated by dividing the estimated slope by its standard error
# t values for each contrast with the reference category (sclass3, working):
# estimate / standard error, by coefficient-table row
#sclass3 vs. sclass1
model$coefficients[2,1] / model$coefficients[2,2]
## [1] 67.94715
#sclass3 vs. sclass2
model$coefficients[3,1] / model$coefficients[3,2]
## [1] 33.95399
#sclass3 vs. sclass4
model$coefficients[4,1] / model$coefficients[4,2]
## [1] -8.561385
#significant if larger than 1.96 or smaller than -1.96, the critical value at the 5% level of significance
\[score_i = \beta_0 + \beta_1 * sclass1_i + \beta_2 * sclass3_i + \beta_3 * sclass4_i + e_i\]
# Same model with sclass2 (intermediate) as the reference category instead:
# the intercept is now the intermediate-class mean score and each coefficient
# is a contrast with that group. Model fit (R-squared, sigma) is unchanged.
fit <- lm(score ~ sclass1 + sclass3 + sclass4, data = mydata)
summary(fit)
##
## Call:
## lm(formula = score ~ sclass1 + sclass3 + sclass4, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.245 -11.245 1.755 12.129 50.898
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.8714 0.1597 199.59 <2e-16 ***
## sclass1TRUE 7.3738 0.2198 33.55 <2e-16 ***
## sclass3TRUE -7.7696 0.2288 -33.95 <2e-16 ***
## sclass4TRUE -10.5209 0.3192 -32.96 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.96 on 33984 degrees of freedom
## Multiple R-squared: 0.15, Adjusted R-squared: 0.15
## F-statistic: 2000 on 3 and 33984 DF, p-value: < 2.2e-16
\[ \hat{score}_i = 31.87 + 7.37 * sclass1_i - 7.77 * sclass3_i - 10.52 * sclass4_i\] ### Residual (unexplained) variance
model <- summary(fit) #keep the summary object so its components can be extracted
model$sigma^2 #residual (unexplained) variance - identical to the previous coding
## [1] 254.8285
model$r.squared #proportion of variance explained - identical to the previous coding
## [1] 0.1500447
The test statistic is calculated by dividing the estimated slope by its standard error
# t values for each contrast with the reference category (sclass2, intermediate):
# estimate / standard error, by coefficient-table row
#sclass2 vs. sclass1
model$coefficients[2,1] / model$coefficients[2,2]
## [1] 33.55003
#sclass2 vs. sclass3
model$coefficients[3,1] / model$coefficients[3,2]
## [1] -33.95399
#sclass2 vs. sclass4
model$coefficients[4,1] / model$coefficients[4,2]
## [1] -32.957
#significant if larger than 1.96 or smaller than -1.96, the critical value at the 5% level of significance
We will now examine score (ratio variable : point score calculated from awards in Standard grades. Scores range from 0 to 75, with a higher score indicating a higher attainment ) and cohort90 taken as a categorical variable (cohorts: 1984, 1986, 1988, 1990, 1996 and 1998). The cohort90 variable is calculated by subtracting 1990 from each value. Thus values range from -6 (corresponding to 1984) to 8 (1998), with 1990 coded as zero.
# Cohort sizes: cohort90 = year - 1990, so -6,-4,-2,0,6,8 map to 1984-1998
table(mydata$cohort90)
##
## -6 -4 -2 0 6 8
## 6478 6325 5245 4371 4244 7325
# One dummy (indicator) variable per cohort year; logical TRUE/FALSE vectors
# are treated as 1/0 by lm().
mydata$cohort90yr84 <- mydata$cohort90 == -6 #1984 cohort
mydata$cohort90yr86 <- mydata$cohort90 == -4 #1986 cohort
mydata$cohort90yr88 <- mydata$cohort90 == -2 #1988 cohort
mydata$cohort90yr90 <- mydata$cohort90 == 0 #1990 cohort
mydata$cohort90yr96 <- mydata$cohort90 == 6 #1996 cohort
mydata$cohort90yr98 <- mydata$cohort90 == 8 #1998 cohort
mydata[1:20, ]
## caseid score cohort90 female sclass sclass1 sclass2 sclass3 sclass4
## 1 339 49 -6 0 2 FALSE TRUE FALSE FALSE
## 2 340 18 -6 0 3 FALSE FALSE TRUE FALSE
## 3 345 46 -6 0 4 FALSE FALSE FALSE TRUE
## 4 346 43 -6 0 3 FALSE FALSE TRUE FALSE
## 5 352 17 -6 0 3 FALSE FALSE TRUE FALSE
## 6 353 29 -6 0 2 FALSE TRUE FALSE FALSE
## 7 354 15 -6 0 3 FALSE FALSE TRUE FALSE
## 8 361 19 -6 0 2 FALSE TRUE FALSE FALSE
## 9 362 45 -6 0 3 FALSE FALSE TRUE FALSE
## 10 363 12 -6 0 1 TRUE FALSE FALSE FALSE
## 11 6824 0 -4 0 1 TRUE FALSE FALSE FALSE
## 12 6826 0 -4 0 3 FALSE FALSE TRUE FALSE
## 13 6827 20 -4 0 2 FALSE TRUE FALSE FALSE
## 14 6828 32 -4 0 1 TRUE FALSE FALSE FALSE
## 15 6829 0 -4 0 2 FALSE TRUE FALSE FALSE
## 16 6834 24 -4 0 3 FALSE FALSE TRUE FALSE
## 17 6836 23 -4 0 2 FALSE TRUE FALSE FALSE
## 18 13206 7 -2 0 3 FALSE FALSE TRUE FALSE
## 19 13209 38 -2 0 3 FALSE FALSE TRUE FALSE
## 20 13215 46 -2 0 1 TRUE FALSE FALSE FALSE
## cohort90yr84 cohort90yr86 cohort90yr88 cohort90yr90 cohort90yr96
## 1 TRUE FALSE FALSE FALSE FALSE
## 2 TRUE FALSE FALSE FALSE FALSE
## 3 TRUE FALSE FALSE FALSE FALSE
## 4 TRUE FALSE FALSE FALSE FALSE
## 5 TRUE FALSE FALSE FALSE FALSE
## 6 TRUE FALSE FALSE FALSE FALSE
## 7 TRUE FALSE FALSE FALSE FALSE
## 8 TRUE FALSE FALSE FALSE FALSE
## 9 TRUE FALSE FALSE FALSE FALSE
## 10 TRUE FALSE FALSE FALSE FALSE
## 11 FALSE TRUE FALSE FALSE FALSE
## 12 FALSE TRUE FALSE FALSE FALSE
## 13 FALSE TRUE FALSE FALSE FALSE
## 14 FALSE TRUE FALSE FALSE FALSE
## 15 FALSE TRUE FALSE FALSE FALSE
## 16 FALSE TRUE FALSE FALSE FALSE
## 17 FALSE TRUE FALSE FALSE FALSE
## 18 FALSE FALSE TRUE FALSE FALSE
## 19 FALSE FALSE TRUE FALSE FALSE
## 20 FALSE FALSE TRUE FALSE FALSE
## cohort90yr98
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## 7 FALSE
## 8 FALSE
## 9 FALSE
## 10 FALSE
## 11 FALSE
## 12 FALSE
## 13 FALSE
## 14 FALSE
## 15 FALSE
## 16 FALSE
## 17 FALSE
## 18 FALSE
## 19 FALSE
## 20 FALSE
\[score_i = \beta_0 + \beta_1 * cohort90yr84_i + \beta_2 * cohort90yr86_i + \\ \beta_3 * cohort90yr90_i + \beta_4 * cohort90yr96_i + \beta_5 * cohort90yr98_i + e_i\]
# Regress score on the cohort dummies, omitting cohort90yr88 (1988) as the
# reference category: the intercept is the 1988 cohort mean score and each
# coefficient is the difference between that cohort and the 1988 cohort.
fit <- lm(score ~ cohort90yr84 + cohort90yr86 + cohort90yr90 + cohort90yr96 + cohort90yr98, data = mydata)
summary(fit)
##
## Call:
## lm(formula = score ~ cohort90yr84 + cohort90yr86 + cohort90yr90 +
## cohort90yr96 + cohort90yr98, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -41.331 -11.685 0.565 12.227 45.345
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 28.5245 0.2179 130.905 <2e-16 ***
## cohort90yr84TRUE -4.8691 0.2931 -16.611 <2e-16 ***
## cohort90yr86TRUE -3.7519 0.2947 -12.731 <2e-16 ***
## cohort90yr90TRUE 0.5759 0.3232 1.782 0.0748 .
## cohort90yr96TRUE 10.9102 0.3258 33.485 <2e-16 ***
## cohort90yr98TRUE 12.8061 0.2854 44.864 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.78 on 33982 degrees of freedom
## Multiple R-squared: 0.1694, Adjusted R-squared: 0.1693
## F-statistic: 1386 on 5 and 33982 DF, p-value: < 2.2e-16
\[ \hat{score}_i = 28.52 - 4.87 * cohort90yr84_i - 3.75 * cohort90yr86_i + \\ 0.58 * cohort90yr90_i + 10.91 * cohort90yr96_i + 12.81 * cohort90yr98_i\]
The coefficients increase steadily with year, consistent with a linear trend, so we can treat cohort90 as a continuous variable in all subsequent analyses.