## ----include = FALSE----------------------------------------------------------
# Chunk defaults: merge source and output into one block and prefix
# printed output with "#>"
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

## ----setup--------------------------------------------------------------------
library(avilistr)
library(dplyr)
library(ggplot2)
library(tidyverse)

## ----load-data----------------------------------------------------------------
# Load the three packaged datasets in one call:
#   avilist_2025        complete dataset (26 fields)
#   avilist_2025_short  essential fields (~12 fields)
#   avilist_metadata    field descriptions
data(list = c("avilist_2025", "avilist_2025_short", "avilist_metadata"))

# Report row and column counts for both checklist tables
cat(
  "Full dataset:", nrow(avilist_2025), "records,",
  ncol(avilist_2025), "fields\n"
)
cat(
  "Short dataset:", nrow(avilist_2025_short), "records,",
  ncol(avilist_2025_short), "fields\n"
)

## ----explore-basic------------------------------------------------------------
# Tally the rows of the short checklist by taxonomic rank,
# most frequent rank first
avilist_2025_short %>%
  count(Taxon_rank) %>%
  arrange(desc(n))

## ----explore-taxonomy---------------------------------------------------------
# Rank orders by their number of species-rank records and keep the ten
# richest
species_by_order <- avilist_2025_short %>%
  filter(Taxon_rank == "species") %>%
  count(Order) %>%
  arrange(desc(n)) %>%
  slice_head(n = 10)

print(species_by_order)

## ----plot-orders, fig.width=8, fig.height=5-----------------------------------
# Horizontal bar chart of the top-10 orders; bars are sorted by species
# count via reorder() and laid out horizontally by coord_flip()
species_by_order %>%
  ggplot(aes(x = reorder(Order, n), y = n)) +
  geom_col(fill = "steelblue", alpha = 0.8) +
  coord_flip() +
  theme_minimal() +
  labs(
    x = "Order",
    y = "Number of Species",
    title = "Most Species-Rich Bird Orders",
    subtitle = "Top 10 orders by number of species",
    caption = "Data: AviList Global Avian Checklist v2025"
  )

## ----explore-families---------------------------------------------------------
# Species counts per family (scientific and English family name kept
# together), limited to the fifteen richest families
family_richness <- avilist_2025_short %>%
  filter(Taxon_rank == "species") %>%
  count(Family, Family_English_name) %>%
  arrange(desc(n)) %>%
  slice_head(n = 15)

print(family_richness)

## ----plot-families, fig.width=10, fig.height=6--------------------------------
# Horizontal bar chart of the top-15 families, labelled with their English
# names and sorted by species count
family_richness %>%
  ggplot(aes(x = reorder(Family_English_name, n), y = n)) +
  geom_col(fill = "darkgreen", alpha = 0.8) +
  coord_flip() +
  labs(
    x = "Family",
    y = "Number of Species",
    title = "Most Species-Rich Bird Families",
    subtitle = "Top 15 families by number of species",
    caption = "Data: AviList Global Avian Checklist v2025"
  ) +
  # theme() must follow theme_minimal(), which would otherwise reset it
  theme_minimal() +
  theme(axis.text.y = element_text(size = 10))

## ----filter-examples----------------------------------------------------------
# Species-rank records of the family Turdidae, reduced to their
# scientific and English names
thrushes <- avilist_2025_short %>%
  filter(Family == "Turdidae") %>%
  filter(Taxon_rank == "species") %>%
  select(Scientific_name, English_name_AviList)

cat("Number of thrush species:", nrow(thrushes), "\n")
head(thrushes)

## ----filter-raptors-----------------------------------------------------------
# Count species in a hand-picked set of raptor (bird of prey) families
raptor_families <- c(
  "Accipitridae",
  "Falconidae",
  "Strigidae",
  "Tytonidae"
)

raptors <- avilist_2025_short %>%
  filter(Taxon_rank == "species", Family %in% raptor_families) %>%
  count(Family, Family_English_name) %>%
  arrange(desc(n))

print(raptors)

## ----pattern-matching---------------------------------------------------------
# Species whose English name contains "Robin" (case-sensitive match),
# listed family by family
robins <- avilist_2025_short %>%
  filter(Taxon_rank == "species") %>%
  filter(str_detect(English_name_AviList, "Robin")) %>%
  select(Scientific_name, English_name_AviList, Family) %>%
  arrange(Family)

print(robins)

## ----genus-search-------------------------------------------------------------
# Species of the genus Turdus: the scientific name must start with
# "Turdus " (the trailing space excludes longer genus names that merely
# begin with "Turdus")
turdus_species <- avilist_2025_short %>%
  filter(Taxon_rank == "species") %>%
  filter(str_detect(Scientific_name, "^Turdus ")) %>%
  select(Scientific_name, English_name_AviList) %>%
  arrange(Scientific_name)

cat("Number of Turdus species:", nrow(turdus_species), "\n")
head(turdus_species, 10)

## ----data-quality-------------------------------------------------------------
# Helper: number of NA entries in a vector
count_missing <- function(x) {
  sum(is.na(x))
}

# One-row completeness summary of the full dataset: total record count
# plus the number of missing values in four key fields
data_completeness <- avilist_2025 %>%
  summarise(
    total_records = n(),
    missing_scientific_names = count_missing(Scientific_name),
    missing_families = count_missing(Family),
    missing_orders = count_missing(Order),
    missing_avilist_names = count_missing(English_name_AviList)
  )

print(data_completeness)

## ----name-comparison----------------------------------------------------------
# Compare English-name coverage between AviList and Clements v2024 for
# species-rank records. Presence flags are computed once per row and
# then summed.
name_comparison <- avilist_2025 %>%
  filter(Taxon_rank == "species") %>%
  mutate(
    avilist_named = !is.na(English_name_AviList),
    clements_named = !is.na(English_name_Clements_v2024)
  ) %>%
  summarise(
    total_species = n(),
    has_avilist_name = sum(avilist_named),
    has_clements_name = sum(clements_named),
    has_both_names = sum(avilist_named & clements_named),
    # Comparing against a missing name yields NA; na.rm drops those rows
    names_differ = sum(
      English_name_AviList != English_name_Clements_v2024,
      na.rm = TRUE
    )
  )

print(name_comparison)

## ----name-differences---------------------------------------------------------
# First ten species that have an English name in both sources but where
# the two names are not the same
name_differences <- avilist_2025 %>%
  filter(Taxon_rank == "species") %>%
  filter(
    !is.na(English_name_AviList),
    !is.na(English_name_Clements_v2024)
  ) %>%
  filter(English_name_AviList != English_name_Clements_v2024) %>%
  select(Scientific_name, English_name_AviList, English_name_Clements_v2024) %>%
  slice_head(n = 10)

print(name_differences)

## ----performance-tips---------------------------------------------------------
# For large analyses, use the short dataset when possible; system.time()
# reports how long the species-per-order count takes on it
system.time({
  short_analysis <- avilist_2025_short %>%
    filter(Taxon_rank == "species") %>%
    count(Order)
})

# Filter early to reduce data size
songbirds <- avilist_2025_short %>%
  filter(Order == "Passeriformes", Taxon_rank == "species")

cat("Songbird species:", nrow(songbirds), "\n")

# Select only needed columns to reduce memory usage
essential_fields <- avilist_2025 %>%
  select(Scientific_name, English_name_AviList, Family, Order, Taxon_rank)

# Report the column count and the measured in-memory size separately:
# a column count on its own is not a memory figure
cat(
  "Columns reduced from", ncol(avilist_2025),
  "to", ncol(essential_fields), "\n"
)
cat(
  "Memory usage reduced from",
  format(utils::object.size(avilist_2025), units = "auto"),
  "to",
  format(utils::object.size(essential_fields), units = "auto"),
  "\n"
)

## ----taxize-example, eval=FALSE-----------------------------------------------
# library(taxize)
# 
# # Get a sample of species for validation
# sample_species <- avilist_2025_short %>%
#   filter(Family == "Turdidae", Taxon_rank == "species") %>%
#   pull(Scientific_name) %>%
#   head(5)
# 
# # Validate names with GBIF (commented out to avoid API calls in vignette)
# # gbif_validation <- get_gbifid(sample_species)

## ----rebird-example, eval=FALSE-----------------------------------------------
# library(rebird)
# 
# # Get Cornell Lab species codes from full dataset
# thrush_codes <- avilist_2025 %>%
#   filter(Family == "Turdidae", Taxon_rank == "species") %>%
#   select(Scientific_name, Species_code_Cornell_Lab) %>%
#   filter(!is.na(Species_code_Cornell_Lab))
# 
# # Example: Get recent observations (commented out to avoid API calls)
# # recent_thrushes <- ebirdregion("US-NY", species = thrush_codes$Species_code_Cornell_Lab[1])

## ----taxonomic-patterns-------------------------------------------------------
# Find monotypic genera (genera with only one species). The genus is taken
# as the leading capitalised word of the scientific name.
monotypic_genera <- avilist_2025_short %>%
  filter(Taxon_rank == "species") %>%
  mutate(genus = str_extract(Scientific_name, "^[A-Z][a-z]+")) %>%
  count(genus, Family) %>%
  filter(n == 1) %>%
  arrange(Family)

cat("Number of monotypic genera:", nrow(monotypic_genera), "\n")

# Ten families with the most monotypic genera
monotypic_summary <- monotypic_genera %>%
  count(Family, name = "monotypic_genera", sort = TRUE) %>%
  slice_head(n = 10)

print(monotypic_summary)

## ----geographic-analysis------------------------------------------------------
# Analyze type localities (where species were first described): assign each
# species with a recorded type locality to a broad region by keyword, then
# count species per region.
#
# Keywords are matched case-insensitively as whole words (\b anchors), so a
# keyword embedded in a longer word is not a hit -- e.g. "usa" inside
# "Jerusalem", "India" inside "Indiana", "asia" inside "Australasia", or
# "Peru" inside "Perugia". Adjective/plural forms that should still count
# ("Australian", "European", "African", "Asian", "Japanese", "American",
# "Americas", "Brazilian", "Peruvian") are spelled out in the patterns.
# "Indian" is deliberately not matched because it is ambiguous
# (e.g. "Indian Ocean").
#
# case_when() uses the first rule that matches, so a locality naming several
# regions is assigned to the earliest one listed; unmatched localities fall
# through to "Other".
type_localities <- avilist_2025 %>%
  filter(Taxon_rank == "species", !is.na(Type_locality)) %>%
  mutate(
    continent = case_when(
      str_detect(
        Type_locality,
        regex("\\b(Australian?|New Zealand)\\b", ignore_case = TRUE)
      ) ~ "Australasia",
      str_detect(
        Type_locality,
        regex("\\bEuropean?\\b", ignore_case = TRUE)
      ) ~ "Europe",
      str_detect(
        Type_locality,
        regex("\\bAfrican?\\b", ignore_case = TRUE)
      ) ~ "Africa",
      str_detect(
        Type_locality,
        regex("\\b(Asian?|China|Japan(ese)?|India)\\b", ignore_case = TRUE)
      ) ~ "Asia",
      str_detect(
        Type_locality,
        regex(
          "\\b(America[ns]?|Brazil(ian)?|Peru(vian)?|Mexico|Canada|USA)\\b",
          ignore_case = TRUE
        )
      ) ~ "Americas",
      TRUE ~ "Other"
    )
  ) %>%
  count(continent, sort = TRUE)

print(type_localities)

## ----metadata-exploration-----------------------------------------------------
# Show the field dictionary in full
print(avilist_metadata)

# Names of the fields flagged as present in the short dataset
short_fields <- avilist_metadata %>%
  filter(in_short_version) %>%
  pull(field_name)
cat("Fields in short dataset:\n")
cat(toString(short_fields), "\n\n")

# Names of the fields flagged for the full dataset only
full_only_fields <- avilist_metadata %>%
  filter(in_full_version, !in_short_version) %>%
  pull(field_name)
cat("Additional fields in full dataset:\n")
cat(toString(full_only_fields), "\n")