Web table scraping
Adapted from here. More examples are available on RPubs.
1 2 3 4 5 6 7 8 9 |
# Scrape every HTML table from a Wikipedia page using XML + RCurl.
library(XML)
library(RCurl)
library(rlist)

# Fetch the raw HTML; ssl.verifypeer = FALSE works around certificate
# verification failures in older RCurl builds.
theurl <- getURL(
  "https://en.wikipedia.org/wiki/Brazil_national_football_team",
  .opts = list(ssl.verifypeer = FALSE)
)

# Parse each <table> into a data frame; unparseable tables come back NULL,
# so drop those entries.
tables <- readHTMLTable(theurl)
tables <- list.clean(tables, fun = is.null, recursive = FALSE)

# Row count of each parsed table. vapply keeps the result a typed integer
# vector — unlist(lapply(t, function(t) dim(t)[1])) silently changes return
# type on empty input, and nrow(t) is clearer than dim(t)[1].
n.rows <- vapply(tables, nrow, integer(1))
NB: RCurl is required; otherwise messages like the following appear.
Warning message: XML content does not seem to be XML: ‘https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population’
Xpath
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 |
# Scrape the "wikitable sortable" table via explicit XPath queries.
library(RCurl)
library(XML)

theurl <- "http://en.wikipedia.org/wiki/Brazil_national_football_team"
webpage <- getURL(theurl)
# Round-trip through a text connection to split the page into lines.
webpage <- readLines(tc <- textConnection(webpage)); close(tc)
# Suppress parser noise with a no-op error handler (best-effort parse).
pagetree <- htmlTreeParse(webpage, error = function(...){}, useInternalNodes = TRUE)

# Extract table header and contents
tablehead <- xpathSApply(pagetree, "//*/table[@class='wikitable sortable']/tr/th", xmlValue)
results <- xpathSApply(pagetree, "//*/table[@class='wikitable sortable']/tr/td", xmlValue)

# Convert character vector to dataframe; the table has 8 columns, so fill
# row by row.
content <- as.data.frame(matrix(results, ncol = 8, byrow = TRUE))

# Clean up the results: strip the mis-encoded non-breaking space ("Â ").
content[, 1] <- gsub("Â ", "", content[, 1])
# BUG FIX: the original read gsub("Â ", """, tablehead) — an unterminated
# string literal (syntax error). The replacement must be the empty string "".
tablehead <- gsub("Â ", "", tablehead)
names(content) <- tablehead
rvest and xml2
1 2 3 4 5 6 7 |
# Scrape the 4th table from the Wikipedia page with rvest (built on xml2).
library(rvest)

theurl <- "http://en.wikipedia.org/wiki/Brazil_national_football_team"

# Download and parse the page into an xml2 document.
file <- read_html(theurl)

# Collect every <table> node on the page.
tables <- html_nodes(file, "table")

# Parse the 4th node into a data frame; single-bracket indexing keeps a
# node list, so the result is a list containing one data frame.
# fill = TRUE pads ragged rows with NA.
table1 <- html_table(tables[4], fill = TRUE)
1914–1917 | 1917 | 1917 | 1917 | 1918–1919 | 1919–1938 | 1945–1949 | 1954–1974 |
26
MAY
MAY
About the Author:
Beyond 8 hours - Computer, Sports, Family...