# Read the raw data from star_data.csv and preview its first rows.
d <- read.csv(file = "star_data.csv")
head(d)

# Crucial: keep only fully observed rows. A row is dropped as soon as *any*
# of its columns is NA, not only the columns used further down.
d <- d[complete.cases(d), ]

library(dplyr)
# For every star x grade cell, print the share of rows with
# gender == "female" and the share of rows with ethnicity == "afam".
d %>%
  group_by(star, grade) %>%
  summarise(
    meanfemale = mean(gender == "female"),
    share_afram = mean(ethnicity == "afam")
  )

# Keep the grade "k" rows of the "regular" and "small" groups and add a
# logical indicator `small` that is TRUE for the "small" group.
star_df_k_small <- d %>%
  filter(grade == "k", star %in% c("regular", "small")) %>%
  mutate(small = star == "small")

# Regress the math score on the indicator. With a single binary regressor
# the slope is the difference in mean math score between the two groups.
lm(formula = math ~ small, data = star_df_k_small)


# Task 2

# Restrict the data to grade "1" rows in the "small" and "regular" groups.
star_df_clean <- d %>%
  filter(star %in% c("small", "regular"), grade == "1")

# Average math score of each group, using base R only.
# The `math` column is extracted with `[[` before the rows are selected, so
# the value passed to mean() is always a plain vector. Indexing with
# `[rows, "math"]` only works because a data.frame drops a single column to a
# vector; a tibble would keep it as a one-column table and mean() would
# return NA with a warning.
m_small <- mean(star_df_clean[["math"]][star_df_clean$star == "small"])
m_reg <- mean(star_df_clean[["math"]][star_df_clean$star == "regular"])

# Difference in mean math score: "small" minus "regular".
q2 <- m_small - m_reg

# The same group means computed with dplyr: one row per `star` value, with
# the mean math score in `meanmath`. Arguably nicer to read than base R.
star_df_clean %>%
  group_by(star) %>%
  summarise(meanmath = mean(math))

# Add a logical treatment indicator: TRUE for rows in the "small" group.
star_df_clean <- mutate(star_df_clean, treatment = star == "small")

# Regress the math score on the treatment indicator.
# Linear model: y = a + b x + u, with y = math and x = treatment.
lm(formula = math ~ treatment, data = star_df_clean)