I am reading multiple PDF files with `readtext()`, and I want to remove the first 10 or 20 lines of each PDF before creating the corpus and the tokens with quanteda.
library(readtext)
library(quanteda)
# Number of leading lines to discard from every PDF before building the corpus.
n_skip <- 10L

# Drop the first `n` lines of each element of `text`.
#
# `text` is a character vector with one element per document; lines are taken
# to be separated by "\n". A document with `n` or fewer lines becomes "".
# Returns a character vector of the same length as `text`.
drop_leading_lines <- function(text, n) {
  vapply(
    strsplit(text, "\n", fixed = TRUE),
    # A logical mask is used instead of `lines[-seq_len(n)]`, which would
    # return an empty vector (rather than everything) when `n` is 0.
    function(lines) paste(lines[seq_along(lines) > n], collapse = "\n"),
    character(1),
    USE.NAMES = FALSE
  )
}

# Read every file matching "*.pdf"; the docvars doc_type and year are taken
# from the file names, split on "_". The separator argument of readtext() is
# `dvsep` (not `sep`).
testfiles <- readtext(
  "*.pdf",
  docvarsfrom = "filenames",
  docvarnames = c("doc_type", "year"),
  dvsep = "_"
)

# Strip the leading lines while the documents are still plain text, i.e.
# before the corpus and tokens are created.
testfiles$text <- drop_leading_lines(testfiles$text, n_skip)

corp <- corpus(testfiles)

# Tokenise once, applying all removal options in the same call.
toks <- tokens(
  corp,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE,
  verbose = TRUE
)
toks <- tokens_tolower(toks)
toks <- tokens_remove(toks, stopwords("en"))
toks <- tokens_wordstem(
  toks,
  language = quanteda_options("language_stemmer")
)