Try this:
library(XML)
# Download and parse the Wikipedia periodic-table page into an internal tree.
URL <- "http://en.wikipedia.org/wiki/Periodic_table"
root <- htmlTreeParse(URL, useInternalNodes = TRUE)

# For a single <a> node: its attributes followed by its text value.
f <- function(x) {
  node_attrs <- xmlAttrs(x)
  node_text <- xmlValue(x)
  c(node_attrs, node_text)
}

# Run f over every <a> tag inside the 3rd table and stack the results as rows.
# Warnings raised by rbind() are deliberately silenced (presumably the
# per-link vectors are not all the same length -- confirm against the page).
link_info <- xpathApply(root, "//table[3]//a", f)
link_matrix <- suppressWarnings(do.call("rbind", link_info))

# Keep only the rows whose "class" column starts like a chemical symbol
# (an upper-case letter optionally followed by lower-case letters), and
# only the first three columns.
is_symbol_row <- grepl("^[[:upper:]][[:lower:]]{0,2}", link_matrix[, "class"])
m3 <- link_matrix[is_symbol_row, seq_len(3)]

# Friendlier column names, absolute URLs, and names cut at the first space.
colnames(m3) <- c("URL", "Name", "Symbol")
m3[, "URL"] <- sub("^", "http://en.wikipedia.org", m3[, "URL"])
m3[, "Name"] <- sub(" .*", "", m3[, "Name"])
A bit of the output:
> dim(m3)
[1] 118 3
> head(m3)
URL Name Symbol
[1,] "http://en.wikipedia.org/wiki/Hydrogen" "Hydrogen" "H"
[2,] "http://en.wikipedia.org/wiki/Helium" "Helium" "He"
[3,] "http://en.wikipedia.org/wiki/Lithium" "Lithium" "Li"
[4,] "http://en.wikipedia.org/wiki/Beryllium" "Beryllium" "Be"
[5,] "http://en.wikipedia.org/wiki/Boron" "Boron" "B"
[6,] "http://en.wikipedia.org/wiki/Carbon" "Carbon" "C"
We can make this more compact by refining the XPath expression further: start with Jeffrey's XPath expression (since it nearly gets the elements at the top) and add a qualification to it so that it gets exactly those. In that case xpathSApply can be used, which eliminates the need for do.call or the plyr package. The last bit, where we fix up odds and ends, is the same as before. This produces a matrix rather than a data frame, which seems preferable since the content is entirely character.
library(XML)
# Download and parse the Wikipedia periodic-table page into an internal tree.
URL <- "http://en.wikipedia.org/wiki/Periodic_table"
root <- htmlTreeParse(URL, useInternalNodes = TRUE)

# For a single <a> node: its attributes followed by its text value.
f <- function(x) {
  node_attrs <- xmlAttrs(x)
  node_text <- xmlValue(x)
  c(node_attrs, node_text)
}

# Select the non-empty <a> tags sitting directly in cells of the 3rd table,
# apply f to each, transpose so that each link becomes a row, and keep the
# first 118 rows.
link_columns <- xpathSApply(root, "//table[3]/tr/td/a[.!='']", f)
M <- t(link_columns)[seq_len(118), ]

# Nicer column names, absolute URLs, and names cut at the first space
# (the Mercury fix-up).
colnames(M) <- c("URL", "Name", "Symbol")
M[, "URL"] <- sub("^", "http://en.wikipedia.org", M[, "URL"])
M[, "Name"] <- sub(" .*", "", M[, "Name"])

# Open the result in the interactive data viewer.
View(M)