# Analysis of the data in "star_data.csv": compares math scores between
# `small` and `regular` values of the `star` column, first for grade "k"
# and then for grade "1", using group means and simple linear regressions.

library(dplyr)

# ---- Load and clean ----

d <- read.csv("star_data.csv")
head(d)

# Crucial: drop every row containing an NA. The mean() calls below do not
# use na.rm = TRUE, so a single NA would turn the whole result into NA.
d <- d[complete.cases(d), ]

# Share of rows with gender == "female" and with ethnicity == "afam"
# for each star/grade combination.
d %>%
  group_by(star, grade) %>%
  summarise(
    meanfemale = mean(gender == "female"),
    share_afram = mean(ethnicity == "afam")
  ) %>%
  ungroup()

# ---- Grade "k": small vs. regular ----

# Keep only the small/regular rows of grade "k" and create a logical
# `small` indicator.
star_df_k_small <- d %>%
  filter(star %in% c("regular", "small") & grade == "k") %>%
  mutate(small = (star == "small"))

# Regress the math score on the `small` indicator.
lm(math ~ small, data = star_df_k_small)

# ---- Task 2: grade "1", small vs. regular ----

star_df_clean <- d %>%
  filter(grade == "1" & star %in% c("small", "regular"))

# Average math score for both groups, using base R.
m_small <- mean(star_df_clean$math[star_df_clean$star == "small"])
m_reg <- mean(star_df_clean$math[star_df_clean$star == "regular"])

# Difference in mean math score: small minus regular.
q2 <- m_small - m_reg

# Identical group means, but arguably nicer to read with dplyr.
star_df_clean %>%
  group_by(star) %>%
  summarise(meanmath = mean(math))

# Create a logical treatment indicator (TRUE when star == "small").
star_df_clean <- star_df_clean %>%
  mutate(treatment = (star == "small"))

# Regress the math score on treatment: linear model (y = a + b x + u).
# With a single binary regressor, the slope b equals the difference in
# group means, i.e. it should match `q2`.
lm(math ~ treatment, data = star_df_clean)