Perfilado de sección

    • Hi all,

      Here is the short code snippet for scraping the distribution of Italian surnames that we mentioned in class. Try it out!

      library(rvest)
      
      # Scrape the surname ranking table from a single results page.
      # `url` is handed to read_html(); the table is located through its CSS
      # id "#classifica-cognomi" and parsed with html_table().
      fetch_ranking <- function(url) {
        read_html(url) |>
          html_element("#classifica-cognomi") |>
          html_table()
      }

      base_url <- "https://www.mappadeicognomi.it/classifica_cognomi.php"

      # The first page is requested at the bare URL, pages 2-10 via `?p=<n>`.
      page_urls <- c(base_url, paste0(base_url, "?p=", 2:10))

      # Download all pages first and "row bind" them in one go: calling
      # rbind() inside a loop re-copies the accumulated table every iteration.
      pages <- lapply(page_urls, fetch_ranking)
      rt <- do.call(rbind, pages)
      # Discard the first row of the combined table and give the three
      # columns descriptive names.
      rt <- setNames(rt[-1, ], c("rank", "surname", "number"))

      # Coerce the counts to integer; entries that cannot be read as a number
      # become NA ...
      rt[["number"]] <- as.integer(rt[["number"]])

      # ... so that rows such as "prossimi cento" etc. can be dropped by
      # keeping only the rows without any NA.
      keep <- complete.cases(rt)
      rt <- rt[keep, ]
      
      # Sort alphabetically, so the row index walks through the surnames in
      # dictionary order.
      rt <- rt[order(rt$surname), ]

      # Running total of occurrences along the alphabetically sorted names.
      rt$csum <- cumsum(rt$number)

      # Half of the total number of occurrences of all names.
      med <- max(rt$csum) / 2

      # Median name: the first surname at which the running total reaches half
      # of all occurrences. Choosing the row whose running total is merely
      # *closest* to `med` can land one row too early: counts c(1, 10, 1) give
      # csum c(1, 11, 12) and med 6, where rows 1 and 2 are equally close but
      # only row 2 actually contains the midpoint.
      idx <- which(rt$csum >= med)[1L]
      
      # Plot the running total against the alphabetical row index and mark the
      # median name with a vertical line plus an extra tick on the x axis.
      # seq_len() is used instead of 1:nrow(rt) so an empty table does not
      # produce the index vector c(1, 0); `rt$surname[idx]` extracts the name
      # as a plain scalar whether `rt` is a data.frame or a tibble.
      plot(seq_len(nrow(rt)), rt$csum,
           main = paste("median name:", rt$surname[idx]),
           xlab = "name index", ylab = "number of occurences")
      abline(v = idx)
      axis(1, at = idx)