# Exploratory walkthrough of the AviList 2025 checklist data sets: record
# counts by rank, order and family, name-based filtering, a comparison of
# AviList and Clements English names, and two bar charts.
# The `## ----label----` lines are knitr-style chunk markers and are kept
# verbatim.

## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup--------------------------------------------------------------------
library(avilistr)
library(dplyr)
library(ggplot2)
# str_detect(), str_extract() and regex() used below are stringr functions,
# which library(tidyverse) attaches.
library(tidyverse)

## ----load-data----------------------------------------------------------------
# Load the datasets
data(avilist_2025)       # Complete dataset (26 fields)
data(avilist_2025_short) # Essential fields (~12 fields)
data(avilist_metadata)   # Field descriptions

# Check data dimensions
cat(
  "Full dataset:", nrow(avilist_2025), "records,",
  ncol(avilist_2025), "fields\n"
)
cat(
  "Short dataset:", nrow(avilist_2025_short), "records,",
  ncol(avilist_2025_short), "fields\n"
)

## ----explore-basic------------------------------------------------------------
# Count records by taxonomic rank
avilist_2025_short %>%
  count(Taxon_rank, sort = TRUE)

## ----explore-taxonomy---------------------------------------------------------
# Count species by order (top 10)
species_by_order <- avilist_2025_short %>%
  filter(Taxon_rank == "species") %>%
  count(Order, sort = TRUE) %>%
  head(10)

print(species_by_order)

## ----plot-orders, fig.width=8, fig.height=5-----------------------------------
# Visualize most diverse orders.
# reorder() sorts the bars by species count; coord_flip() makes them horizontal.
ggplot(species_by_order, aes(x = reorder(Order, n), y = n)) +
  geom_col(fill = "steelblue", alpha = 0.8) +
  coord_flip() +
  labs(
    title = "Most Species-Rich Bird Orders",
    subtitle = "Top 10 orders by number of species",
    x = "Order",
    y = "Number of Species",
    caption = "Data: AviList Global Avian Checklist v2025"
  ) +
  theme_minimal()

## ----explore-families---------------------------------------------------------
# Most diverse bird families (top 15 by species count)
family_richness <- avilist_2025_short %>%
  filter(Taxon_rank == "species") %>%
  count(Family, Family_English_name, sort = TRUE) %>%
  head(15)

print(family_richness)

## ----plot-families, fig.width=10, fig.height=6--------------------------------
# Visualize family diversity
ggplot(family_richness, aes(x = reorder(Family_English_name, n), y = n)) +
  geom_col(fill = "darkgreen", alpha = 0.8) +
  coord_flip() +
  labs(
    title = "Most Species-Rich Bird Families",
    subtitle = "Top 15 families by number of species",
    x = "Family",
    y = "Number of Species",
    caption = "Data: AviList Global Avian Checklist v2025"
  ) +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 10))

## ----filter-examples----------------------------------------------------------
# Get all thrush species
thrushes <- avilist_2025_short %>%
  filter(Family == "Turdidae", Taxon_rank == "species") %>%
  select(Scientific_name, English_name_AviList)

cat("Number of thrush species:", nrow(thrushes), "\n")
head(thrushes)

## ----filter-raptors-----------------------------------------------------------
# Get all raptors (birds of prey) and count species per family
raptor_families <- c("Accipitridae", "Falconidae", "Strigidae", "Tytonidae")

raptors <- avilist_2025_short %>%
  filter(Family %in% raptor_families, Taxon_rank == "species") %>%
  count(Family, Family_English_name, sort = TRUE)

print(raptors)

## ----pattern-matching---------------------------------------------------------
# Find species with "Robin" in their name (case-sensitive substring match)
robins <- avilist_2025_short %>%
  filter(
    str_detect(English_name_AviList, "Robin"),
    Taxon_rank == "species"
  ) %>%
  select(Scientific_name, English_name_AviList, Family) %>%
  arrange(Family)

print(robins)

## ----genus-search-------------------------------------------------------------
# Explore a specific genus (Turdus).
# The pattern is anchored at the start and ends with a space, so only names
# whose first word is exactly "Turdus" are kept.
turdus_species <- avilist_2025_short %>%
  filter(
    str_detect(Scientific_name, "^Turdus "),
    Taxon_rank == "species"
  ) %>%
  select(Scientific_name, English_name_AviList) %>%
  arrange(Scientific_name)

cat("Number of Turdus species:", nrow(turdus_species), "\n")
head(turdus_species, 10)

## ----data-quality-------------------------------------------------------------
# Summary of data completeness: NA counts for the key identifying columns
data_completeness <- avilist_2025 %>%
  summarise(
    total_records = n(),
    missing_scientific_names = sum(is.na(Scientific_name)),
    missing_families = sum(is.na(Family)),
    missing_orders = sum(is.na(Order)),
    missing_avilist_names = sum(is.na(English_name_AviList))
  )

print(data_completeness)

## ----name-comparison----------------------------------------------------------
# Compare AviList vs Clements naming
name_comparison <- avilist_2025 %>%
  filter(Taxon_rank == "species") %>%
  summarise(
    total_species = n(),
    has_avilist_name = sum(!is.na(English_name_AviList)),
    has_clements_name = sum(!is.na(English_name_Clements_v2024)),
    has_both_names = sum(
      !is.na(English_name_AviList) & !is.na(English_name_Clements_v2024)
    ),
    # `!=` yields NA when either name is missing; na.rm = TRUE drops those
    # rows, so only species with both names present are counted here.
    names_differ = sum(
      English_name_AviList != English_name_Clements_v2024,
      na.rm = TRUE
    )
  )

print(name_comparison)

## ----name-differences---------------------------------------------------------
# Examples where names differ between sources (first 10 rows)
name_differences <- avilist_2025 %>%
  filter(
    Taxon_rank == "species",
    !is.na(English_name_AviList),
    !is.na(English_name_Clements_v2024),
    English_name_AviList != English_name_Clements_v2024
  ) %>%
  select(Scientific_name, English_name_AviList, English_name_Clements_v2024) %>%
  head(10)

print(name_differences)

## ----performance-tips---------------------------------------------------------
# For large analyses, use the short dataset when possible
system.time({
  short_analysis <- avilist_2025_short %>%
    filter(Taxon_rank == "species") %>%
    count(Order)
})

# Filter early to reduce data size
songbirds <- avilist_2025_short %>%
  filter(Order == "Passeriformes", Taxon_rank == "species")

cat("Songbird species:", nrow(songbirds), "\n")

# Select only needed columns to reduce memory usage
essential_fields <- avilist_2025 %>%
  select(Scientific_name, English_name_AviList, Family, Order, Taxon_rank)

# Reports column counts before and after the select(), not bytes.
cat(
  "Memory usage reduced from", ncol(avilist_2025),
  "to", ncol(essential_fields), "columns\n"
)

## ----taxize-example, eval=FALSE-----------------------------------------------
# library(taxize)
#
# # Get a sample of species for validation
# sample_species <- avilist_2025_short %>%
#   filter(Family == "Turdidae", Taxon_rank == "species") %>%
#   pull(Scientific_name) %>%
#   head(5)
#
# # Validate names with GBIF (commented out to avoid API calls in vignette)
# # gbif_validation <- get_gbifid(sample_species)

## ----rebird-example, eval=FALSE-----------------------------------------------
# library(rebird)
#
# # Get Cornell Lab species codes from full dataset
# thrush_codes <- avilist_2025 %>%
#   filter(Family == "Turdidae", Taxon_rank == "species") %>%
#   select(Scientific_name, Species_code_Cornell_Lab) %>%
#   filter(!is.na(Species_code_Cornell_Lab))
#
# # Example: Get recent observations (commented out to avoid API calls)
# # recent_thrushes <- ebirdregion(
# #   "US-NY",
# #   species = thrush_codes$Species_code_Cornell_Lab[1]
# # )

## ----taxonomic-patterns-------------------------------------------------------
# Find monotypic genera (genera with only one species).
# The genus is taken as the leading capitalised word of Scientific_name.
monotypic_genera <- avilist_2025_short %>%
  filter(Taxon_rank == "species") %>%
  mutate(genus = str_extract(Scientific_name, "^[A-Z][a-z]+")) %>%
  count(genus, Family) %>%
  filter(n == 1) %>%
  arrange(Family)

cat("Number of monotypic genera:", nrow(monotypic_genera), "\n")

# Number of monotypic genera per family (top 10 families)
monotypic_summary <- monotypic_genera %>%
  count(Family, name = "monotypic_genera") %>%
  arrange(desc(monotypic_genera)) %>%
  head(10)

print(monotypic_summary)

## ----geographic-analysis------------------------------------------------------
# Analyze type localities (where species were first described).
# This is a coarse keyword classification of free text: case_when() returns
# the label of the FIRST matching pattern, so the order of the rules matters.
type_localities <- avilist_2025 %>%
  filter(Taxon_rank == "species", !is.na(Type_locality)) %>%
  mutate(
    continent = case_when(
      # "Australasia" is listed explicitly: it does not contain "Australia"
      # and would otherwise fall through to the "Asia" substring below.
      str_detect(
        Type_locality,
        regex("Australia|Australasia|New Zealand", ignore_case = TRUE)
      ) ~ "Australasia",
      str_detect(
        Type_locality,
        regex("Europe|European", ignore_case = TRUE)
      ) ~ "Europe",
      str_detect(
        Type_locality,
        regex("Africa|African", ignore_case = TRUE)
      ) ~ "Africa",
      str_detect(
        Type_locality,
        regex("Asia|Asian|China|Japan|India", ignore_case = TRUE)
      ) ~ "Asia",
      # "USA" is wrapped in word boundaries: matched case-insensitively as a
      # bare substring it also hits words such as "Jerusalem" or "Lausanne".
      str_detect(
        Type_locality,
        regex(
          "America|Brazil|Peru|Mexico|Canada|\\bUSA\\b",
          ignore_case = TRUE
        )
      ) ~ "Americas",
      TRUE ~ "Other"
    )
  ) %>%
  count(continent, sort = TRUE)

print(type_localities)

## ----metadata-exploration-----------------------------------------------------
# Understand the available fields
print(avilist_metadata)

# Fields available in short vs full dataset
cat("Fields in short dataset:\n")
short_fields <- avilist_metadata %>%
  filter(in_short_version) %>%
  pull(field_name)
cat(paste(short_fields, collapse = ", "), "\n\n")

cat("Additional fields in full dataset:\n")
full_only_fields <- avilist_metadata %>%
  filter(in_full_version & !in_short_version) %>%
  pull(field_name)
cat(paste(full_only_fields, collapse = ", "), "\n")