## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(collapse = TRUE, 
                      warning = TRUE,
                      comment = "#>")
run_vignette <- requireNamespace("GSEABase", quietly = TRUE) &&
    requireNamespace("GO.db", quietly = TRUE) &&  
    requireNamespace("reactome.db", quietly = TRUE) && 
    requireNamespace("org.Hs.eg.db", quietly = TRUE) &&  
    requireNamespace("ggplot2", quietly = TRUE) &&
    requireNamespace("forcats", quietly = TRUE)

## ----setupr, message=FALSE----------------------------------------------------
library("BaseSet", quietly = TRUE)
library("dplyr", quietly = TRUE)

## ----prepare_GO, message=FALSE, eval=run_vignette-----------------------------
# We load some libraries
library("org.Hs.eg.db", quietly = TRUE)
library("GO.db", quietly = TRUE)
library("ggplot2", quietly = TRUE)
# Prepare the data 
h2GO_TS <- tidySet(org.Hs.egGO)
h2GO <- as.data.frame(org.Hs.egGO)

## ----evidence_ontology, eval=run_vignette, fig.alt="Horizontal bar plot with the number of evidence code per ontology."----
library("forcats", include.only = "fct_reorder2", quietly = TRUE)
h2GO %>% 
    group_by(Evidence, Ontology) %>% 
    count(name = "Freq") %>% 
    ungroup() %>% 
    mutate(Evidence = fct_reorder2(Evidence, Ontology, -Freq),
           Ontology = case_match(Ontology,
                                 "CC" ~ "Cellular Component",
                                 "MF" ~ "Molecular Function",
                                 "BP" ~ "Biological Process",
                                 .default = NA)) %>% 
    ggplot() +
    geom_col(aes(Evidence, Freq)) +
    facet_grid(~Ontology) + 
    theme_minimal() +
    coord_flip() +
    labs(x = element_blank(), y = element_blank(),
         title = "Evidence codes for each ontology")

## ----nEvidence_plot, eval=run_vignette, fig.alt="Bar plot with the number of relationships that with a given number of evidences: most only have 1 evidence code but some have up to 7"----
h2GO_TS %>% 
    relations() %>% 
    group_by(elements, sets) %>% 
    count(sort = TRUE, name = "Annotations") %>% 
    ungroup() %>% 
    count(Annotations, sort = TRUE) %>% 
    ggplot() +
    geom_col(aes(Annotations, n)) +
    theme_minimal() +
    labs(x = "Evidence codes", y = "Annotations", 
         title = "Evidence codes for each annotation",
         subtitle = "in human") +
    scale_x_continuous(breaks = 1:7)

## ----numbers, eval=run_vignette-----------------------------------------------
# Add all the genes and GO terms
h2GO_TS <- add_elements(h2GO_TS, keys(org.Hs.eg.db)) %>% 
    add_sets(grep("^GO:", keys(GO.db), value = TRUE))

sizes_element <- element_size(h2GO_TS) %>% 
    arrange(desc(size))
sum(sizes_element$size == 0)
sum(sizes_element$size != 0)

sizes_set <- set_size(h2GO_TS) %>% 
    arrange(desc(size))
sum(sizes_set$size == 0)
sum(sizes_set$size != 0)

## ----plots_GO, eval=run_vignette, fig.alt=c("Histogram of number of sets per element: there are many genes on many ontologies.", "Histogram of number of elements per set: There is one set that is huge but then there many than have few elements.")----
sizes_element %>% 
    filter(size != 0) %>% 
    ggplot() +
    geom_histogram(aes(size), binwidth = 1) +
    theme_minimal() +
    labs(x = "# sets per element", y = "Count")

sizes_set %>% 
    filter(size != 0) %>% 
    ggplot() +
    geom_histogram(aes(size), binwidth = 1) +
    theme_minimal() +
    labs(x = "# elements per set", y = "Count")

## ----distr_sizes, eval=run_vignette-------------------------------------------
head(sizes_set, 10)

## ----fuzzy_setup, eval=run_vignette-------------------------------------------
nr <- h2GO_TS %>% 
    relations() %>% 
    dplyr::select(sets, Evidence) %>% 
    distinct() %>% 
    mutate(fuzzy = case_match(Evidence,
                              "EXP" ~ 0.9,
                              "IDA" ~ 0.8,
                              "IPI" ~ 0.8,
                              "IMP" ~ 0.75,
                              "IGI" ~ 0.7,
                              "IEP" ~ 0.65,
                              "HEP" ~ 0.6,
                              "HDA" ~ 0.6,
                              "HMP" ~ 0.5,
                              "IBA" ~ 0.45,
                              "ISS" ~ 0.4,
                              "ISO" ~ 0.32,
                              "ISA" ~ 0.32,
                              "ISM" ~ 0.3,
                              "RCA" ~ 0.2,
                              "TAS" ~ 0.15,
                              "NAS" ~ 0.1,
                              "IC" ~ 0.02,
                              "ND" ~ 0.02,
                              "IEA" ~ 0.01,
                              .default = 0.01)) %>% 
    dplyr::select(sets = "sets", elements = "Evidence", fuzzy = fuzzy)

## ----fuzzy_setup2, eval=run_vignette------------------------------------------
ts <- h2GO_TS %>% 
    relations() %>% 
    dplyr::select(-Evidence) %>% 
    rbind(nr) %>% 
    tidySet() %>% 
    mutate_element(Type = ifelse(grepl("^[0-9]+$", elements), "gene", "evidence"))

## ----cardinality, eval=run_vignette-------------------------------------------
ts %>% 
    dplyr::filter(Type != "Gene") %>% 
    cardinality() %>% 
    arrange(desc(cardinality)) %>% 
    head()

## ----size_go, eval=run_vignette-----------------------------------------------
ts %>% 
    filter(sets %in% c("GO:0008152", "GO:0003674", "GO:0005575"),
           Type != "gene") %>% 
    set_size()

## ----evidence_go, eval=run_vignette-------------------------------------------
go_terms <- c("GO:0008152", "GO:0003674", "GO:0005575")
ts %>% 
    filter(sets %in% go_terms & Type != "gene") 

## ----prepare_reactome, eval=run_vignette--------------------------------------
# We load some libraries
library("reactome.db")

# Prepare the data (is easier, there isn't any ontoogy or evidence column)
h2p <- as.data.frame(reactomeEXTID2PATHID)
colnames(h2p) <- c("sets", "elements")
# Filter only for human pathways
h2p <- h2p[grepl("^R-HSA-", h2p$sets), ]

# There are duplicate relations with different evidence codes!!: 
summary(duplicated(h2p[, c("elements", "sets")]))
h2p <- unique(h2p)
# Create a TidySet and 
h2p_TS <- tidySet(h2p) %>% 
    # Add all the genes 
    add_elements(keys(org.Hs.eg.db))

## ----numbers_pathways, eval=run_vignette--------------------------------------
sizes_element <- element_size(h2p_TS) %>% 
    arrange(desc(size))
sum(sizes_element$size == 0)
sum(sizes_element$size != 0)

sizes_set <- set_size(h2p_TS) %>% 
    arrange(desc(size))

## ----pathways_plots, eval=run_vignette, fig.alt=c("Genes per pathway.", "Pathway per gene.")----
sizes_element %>% 
    filter(size != 0) %>% 
    ggplot() +
    geom_histogram(aes(size), binwidth = 1) +
    scale_y_log10() +
    theme_minimal() +
    labs(x = "# sets per element", y = "Count")

sizes_set %>% 
    ggplot() +
    geom_histogram(aes(size), binwidth = 1) +
    scale_y_log10() +
    theme_minimal() +
    labs(x = "# elements per set", y = "Count")

## ----distr_sizes_pathways, eval=run_vignette----------------------------------
head(sizes_set, 10)

## ----sessionInfo, echo=FALSE--------------------------------------------------
sessionInfo()