# Example script extracted from the ralger vignette.
# Each `## ----` marker below is a knitr chunk header kept from the source
# document. Every evaluated chunk passes a URL to a ralger function, so the
# script needs network access; chunks marked eval=FALSE stay commented out.

## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----eval=FALSE---------------------------------------------------------------
# install.packages("ralger")

## ----example------------------------------------------------------------------
library(ralger)

my_link <- "http://www.shanghairanking.com/rankings/arwu/2021"

# CSS selector of the elements to extract. SelectorGadget is recommended if
# you are not familiar with CSS selectors.
my_node <- "a span"

# Should the function clean the extracted vector or not? Default is FALSE.
clean <- TRUE

best_uni <- scrap(link = my_link, node = my_node, clean = clean)

head(best_uni, 10)

## -----------------------------------------------------------------------------
# Build the URLs of pages 1 to 3 and pass all of them to scrap() at once.
base_link <- "http://quotes.toscrape.com/page/"

links <- paste0(base_link, 1:3)

node <- ".text"

head(scrap(links, node), 10)

## -----------------------------------------------------------------------------
# Getting all classes' names from the anchor elements
# from the ropensci website.
# The result is not called `attributes`, to avoid masking base::attributes().
link_classes <- attribute_scrap(
  link = "https://ropensci.org/",
  node = "a", # the a tag
  attr = "class" # getting the class attribute
)

head(link_classes, 10) # NA values are a tags without a class attribute

## -----------------------------------------------------------------------------
# Same function, this time reading the src attribute of the script tags.
js_depend <- attribute_scrap(
  link = "https://ropensci.org/",
  node = "script",
  attr = "src"
)

js_depend

## -----------------------------------------------------------------------------
# The result is not called `data`, to avoid masking utils::data().
box_office <- table_scrap(
  link = "https://www.boxofficemojo.com/chart/top_lifetime_gross/?area=XWW"
)

head(box_office)

## ----example3, message=FALSE, warning=FALSE-----------------------------------
my_link <- "http://books.toscrape.com/catalogue/page-1.html"

my_nodes <- c(
  "h3 > a", # Title
  ".price_color", # Price
  ".availability" # Availability
)

# Column names, in the same order as my_nodes.
# The vector is not called `names`, to avoid masking base::names().
col_names <- c("title", "price", "availability")

tidy_scrap(link = my_link, nodes = my_nodes, colnames = col_names)

## ----example4-----------------------------------------------------------------
titles <- titles_scrap(link = "https://www.nytimes.com/")

head(titles)

## -----------------------------------------------------------------------------
# Same page, now with a mixed-case `contain` pattern and
# case_sensitive = FALSE.
titles <- titles_scrap(
  link = "https://www.nytimes.com/",
  contain = "TrUMp",
  case_sensitive = FALSE
)

head(titles)

## -----------------------------------------------------------------------------
pgs <- paragraphs_scrap(link = "https://ropensci.org/")

head(pgs)

## -----------------------------------------------------------------------------
paragraphs_scrap(link = "https://ropensci.org/", collapse = TRUE)

## -----------------------------------------------------------------------------
# Stored as `pdf_links` so the page URLs kept in `links` are not overwritten.
pdf_links <- weblink_scrap(
  link = "https://www.worldbank.org/en/access-to-information/reports/",
  contain = "PDF",
  case_sensitive = FALSE
)

head(pdf_links)

## -----------------------------------------------------------------------------
imgs <- images_preview(link = "https://posit.co/")

head(imgs)

## ----eval=FALSE---------------------------------------------------------------
# # Suppose we're in a project which has a folder called my_images:
# images_scrap(
#   link = "http://books.toscrape.com/",
#   imgpath = here::here("my_images"),
#   extn = "jpg" # images here use .jpg
# )

## ----eval=FALSE---------------------------------------------------------------
# pdf_scrap(
#   link = "https://www.make-it-in-germany.com/en/visa-residence/types/eu-blue-card",
#   path = here::here("my_pdfs")
# )

## ----eval=FALSE---------------------------------------------------------------
# csv_scrap(
#   link = "https://sample-files.com/data/csv/",
#   path = here::here("my_csvs")
# )

## ----eval=FALSE---------------------------------------------------------------
# # NOTE(review): this is the same URL as the xls_scrap() example below;
# # confirm the page also lists .xlsx files.
# xlsx_scrap(
#   link = "https://file-examples.com/index.php/sample-documents-download/sample-xls-download/",
#   path = here::here("my_xlsx")
# )

## ----eval=FALSE---------------------------------------------------------------
# xls_scrap(
#   link = "https://file-examples.com/index.php/sample-documents-download/sample-xls-download/",
#   path = here::here("my_xls")
# )

## -----------------------------------------------------------------------------
images_noalt_scrap(link = "https://www.r-consortium.org/")

## -----------------------------------------------------------------------------
# WebAim is the reference website for web accessibility
images_noalt_scrap(link = "https://webaim.org/techniques/forms/controls")

## -----------------------------------------------------------------------------
head(comments_scrap("https://posit.co"))