Sección: Topic 10 | SEM0103 - ECONOMETRICS Canale B

Perfilado de sección

Hi all,

Here is the short code snippet to scrape the distribution of Italian surnames that we mentioned in class. Try it out!

library(rvest)

# Scrape the surname ranking table from mappadeicognomi.it (10 pages).
base_url <- "https://www.mappadeicognomi.it/classifica_cognomi.php"

# Page 1 is the bare URL; pages 2-10 are selected with the `p` query parameter.
page_urls <- c(base_url, paste0(base_url, "?p=", 2:10))

# Download one page and return its "#classifica-cognomi" table as a data frame.
read_ranking_page <- function(url) {
  read_html(url) |>
    html_element("#classifica-cognomi") |>
    html_table()
}

# Fetch every page first and bind them once, instead of growing `rt` with
# rbind() inside a loop (which re-copies the accumulated rows each iteration).
rt <- do.call(rbind, lapply(page_urls, read_ranking_page))

# Drop the first row -- presumably a header/title row that html_table() parsed
# as data; TODO confirm against the live page.
rt <- rt[-1, ]
names(rt) <- c("rank", "surname", "number")

# Cells that are not plain integers become NA here (R emits a
# "NAs introduced by coercion" warning when that happens).
rt$number <- as.integer(rt$number)

# Drop rows containing NA (e.g. the "prossimi cento" rows and similar).
rt <- rt[complete.cases(rt), ]

# Fail early with a clear message if nothing usable is left.
stopifnot("no surname rows left after cleaning" = nrow(rt) > 0)

# Sort alphabetically by surname.
rt <- rt[order(rt$surname), ]

# Cumulative number of occurrences along the alphabetical ordering.
rt$csum <- cumsum(rt$number)

# Half of the total number of occurrences.
med <- max(rt$csum) / 2

# Median name: the FIRST surname at which the cumulative count reaches half of
# the total (weighted median). Picking the row whose cumulative count is merely
# closest to `med` could select the surname just before the 50% mark.
idx <- which(rt$csum >= med)[1]

# Plot the cumulative count and mark the median surname's position.
plot(seq_len(nrow(rt)), rt$csum,
     main = paste("median name:", rt$surname[idx]),
     xlab = "name index", ylab = "number of occurences")
abline(v = idx)
axis(1, at = idx)

Scuola di Management e Economia

Scuola di Management e Economia

Perfilado de sección