Hi all,
Here is the short code snippet to scrape the distribution of Italian surnames we mentioned in class. Try it out!
library(rvest)
# Scrape the surname ranking: the same table (#classifica-cognomi) is read
# from each of the ten result pages and stacked into one data frame `rt`.
base_url <- "https://www.mappadeicognomi.it/classifica_cognomi.php"
# Page 1 is requested at the bare URL; pages 2-10 use the `p` query parameter.
page_urls <- c(base_url, paste0(base_url, "?p=", 2:10))
pages <- lapply(page_urls, function(url) {
  read_html(url) |>
    html_element("#classifica-cognomi") |>
    html_table()
})
# "Row bind" all pages in a single call rather than growing `rt` with
# rbind() on every loop iteration.
rt <- do.call(rbind, pages)
# Tidy the scraped table: discard its first row, give the three columns
# English names, and keep only rows without missing values.
# NOTE(review): the first row is dropped unconditionally -- confirm it is
# not a real ranking entry.
rt <- rt[-1, ]
colnames(rt) <- c("rank", "surname", "number")
# Values that cannot be parsed as integers become NA here ...
rt[["number"]] <- as.integer(rt[["number"]])
# ... so rows such as "prossimi cento" are removed by the filter below.
has_all_values <- complete.cases(rt)
rt <- rt[has_all_values, ]
# Locate the "median" surname: with surnames in alphabetical order, it is the
# surname at which the running total of occurrences reaches half of the
# overall total. The running total is then plotted with that position marked.
if (nrow(rt) == 0) {
  stop("no surname rows available to compute the median", call. = FALSE)
}
# sort alphabetically
rt <- rt[order(rt$surname), ]
# running total of occurrences, in alphabetical order
rt$csum <- cumsum(rt$number)
# half of the total number of occurrences
med <- max(rt$csum) / 2
# Median name: the first surname whose running total reaches the halfway
# point. Picking the running total *closest* to `med` instead can land one
# row too early, on a surname whose running total is still below `med`.
idx <- which(rt$csum >= med)[1]
# rt$surname[idx] yields a plain character value for data frames and tibbles.
plot(seq_len(nrow(rt)), rt$csum,
  main = paste("median name:", rt$surname[idx]),
  xlab = "name index", ylab = "number of occurences"
)
# mark the median position with a vertical line and label it on the x axis
abline(v = idx)
axis(1, at = idx)