% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/gclink.R
\name{gclink}
\alias{gclink}
\title{Gene-Cluster Discovery, Annotation and Visualization}
\usage{
gclink(
  in_blastp_df = blastp_df,
  in_seq_data = seq_data,
  in_gene_list = photosynthesis_gene_list,
  in_GC_group = PGC_group,
  AllGeneNum = 50,
  MinConSeq = 25,
  apply_length_filter = FALSE,
  down_IQR = 10,
  up_IQR = 10,
  apply_evalue_filter = FALSE,
  min_evalue = 1,
  apply_score_filter = FALSE,
  min_score = 10,
  orf_before_first = 0,
  orf_after_last = 0,
  orf_range = "All",
  levels_gene_group = c("bch", "puh", "puf", "crt", "acsF", "assembly", "regulator",
    "hypothetical ORF"),
  color_theme = c("#3BAA51", "#6495ED", "#DD2421", "#EF9320", "#F8EB00", "#FF0683",
    "#956548", "grey"),
  genome_subset = NULL
)
}
\arguments{
\item{in_blastp_df}{A \code{data.frame} from blast/blastp-style output with columns:
\itemize{
\item \code{qaccver}: Genome + contig name (separated by \code{"---"}), e.g.,
\code{"Kuafubacteriaceae--GCA_016703535.1---JADJBV010000001.1_150"}:
\itemize{
\item Genome: \code{"Kuafubacteriaceae--GCA_016703535.1"} (\code{"--"} separator),
\item Contig: \code{"JADJBV010000001.1"},
\item ORF: \code{"JADJBV010000001.1_150"} (\code{"_"} separator),
\item Position: \code{"150"}.
}
\item \code{saccver}: Gene name + metadata (separated by \code{"_"}), e.g.,
\code{"bchC_Methyloversatilis_sp_RAC08_BSY238_2447_METR"}:
\itemize{
\item Gene: \code{"bchC"},
\item Metadata(Optional): \code{"Methyloversatilis_sp_RAC08_BSY238_2447_METR"}.
}
}
eggNOG (evolutionary gene genealogy Nonsupervised Orthologous Groups) format results
are supported by renaming annotation columns (e.g., \code{"GOs"}) to \code{saccver}.
Default: \code{blastp_df}.}

\item{in_seq_data}{\code{NULL} or a \code{data.frame} with:
\describe{
\item{\code{SeqName}}{ORF identifier (Prodigal format: \verb{>ORF_id # start # end # strand # ...}).}
\item{\code{Sequence}}{ORF sequence.}
}
Example:
\code{"Kuafubacteriaceae--GCA_016703535.1---JADJBV010000001.1_1 # 74 # 1018 # 1 # ..."}
Can be imported from \strong{Prodigal} FASTA using:

\if{html}{\out{<div class="sourceCode r">}}\preformatted{seq_data <- Biostrings::readBStringSet("Prodigal.fasta",format="fasta", nrec=-1L, skip=0L, seek.first.rec=FALSE, use.names=TRUE) \%>\%
  data.frame(Sequence = .) \%>\%
  tibble::rownames_to_column("SeqName")
}\if{html}{\out{</div>}}

\code{NULL} skips coordinate extraction and plotting. Default: \code{seq_data}.}

\item{in_gene_list}{Character vector of reference genes for cluster detection.
Default: \code{photosynthesis_gene_list}.}

\item{in_GC_group}{\code{NULL} or a \code{data.frame} mapping genes to functional groups
(columns: \code{gene}, \code{gene_group}). \code{NULL} skips plotting.
Default: \code{PGC_group}.}

\item{AllGeneNum}{Integer; max ORFs per cluster. Default: \code{50}.}

\item{MinConSeq}{Integer; min consecutive reference genes per cluster. Default: \code{25}.}

\item{apply_length_filter}{Logical; whether to apply length filtering based on IQR.
If \code{FALSE}, skips \code{down_IQR} and \code{up_IQR} filtering.
Default: \code{FALSE}.}

\item{down_IQR}{Numeric; lower-bound scale factor for IQR length
filtering (see \code{length_filter}). Default: \code{10}.}

\item{up_IQR}{Numeric; upper-bound scale factor for IQR length
filtering (see \code{length_filter}). Default: \code{10}.}

\item{apply_evalue_filter}{Logical; whether to filter BLAST hits by e-value cutoff.
Requires column \code{evalue} in input data. Default: \code{FALSE}.}

\item{min_evalue}{Numeric; maximum e-value threshold for retaining BLAST hits.
Hits with e-value > cutoff will be removed.
Typical values: 1e-3 (loose) to 1e-10 (strict).
Default: \code{1}.}

\item{apply_score_filter}{Logical; whether to filter BLAST hits by bitscore cutoff.
Requires column \code{bitscore} in input data. Default: \code{FALSE}.}

\item{min_score}{Numeric; minimum bitscore threshold for retaining BLAST hits.
Hits with bitscore < cutoff will be removed.
For short proteins (~100 aa), 30-40 is typical;
for long proteins (>300 aa), 50-100 is recommended.
Default: \code{10}.}

\item{orf_before_first}{Integer; number of hypothetical ORFs to insert
\strong{before the first gene} of each detected cluster.
Useful when the upstream annotation is incomplete or
low-confidence; insertion is bounded by the first ORF
of the contig. Default: \code{0}.}

\item{orf_after_last}{Integer; number of hypothetical ORFs to append
\strong{after the last gene} of each detected cluster.
Helpful when the downstream annotation is incomplete
or low-confidence; insertion is bounded by the last
ORF of the contig. Default: \code{0}.}

\item{orf_range}{Character. ORF inclusion mode:
\itemize{
\item \code{"All"}: Include all ORFs + annotations (default)
\item \code{"OnlyAnnotated"}: Keep only annotated ORFs
\item \code{"IgnoreAnnotated"}: Include all ORFs without annotation merging
}}

\item{levels_gene_group}{Character vector; factor levels for gene-group
legends (must include "hypothetical ORF" in case
some genes remain unclassified).
Ignored if \code{in_GC_group} is \code{NULL}.
Default: \code{c('bch','puh','puf','crt','acsF','assembly','regulator','hypothetical ORF')}.}

\item{color_theme}{Character vector; colours for \code{gc_plot}, matched to
the length and order of \code{levels_gene_group}.
Ignored if plotting is skipped.
Default: \code{c('#3BAA51','#6495ED','#DD2421','#EF9320','#F8EB00','#FF0683','#956548','grey')}.}

\item{genome_subset}{Character vector or \code{NULL}; genomes to retain.  If \code{NULL}, all genomes
are retained. Default: \code{NULL}.}
}
\value{
A named list with:
\describe{
\item{\code{GC_meta}}{Annotated cluster table (\code{data.frame}).}
\item{\code{GC_seq}}{FASTA sequences (if \code{in_seq_data} provided).}
\item{\code{GC_plot}}{ggplot object (if \code{in_seq_data} and \code{in_GC_group} provided).}
}
}
\description{
\code{gclink()} performs \strong{all steps} from raw blastp output to a publication-ready
gene-cluster plot \strong{in one call}. It dynamically adapts its workflow:
\itemize{
\item If \code{in_seq_data} is provided: Extracts coordinates/sequences, merges data, and generates plots.
\item If \code{in_seq_data} is \code{NULL}: Skips sequence-dependent steps, returning only the cluster table.
\item If \code{in_GC_group} is \code{NULL}: Omits functional-group merging and plotting.
\item Supports custom gene lists (e.g., photosynthesis, viral genes) via \code{in_gene_list} for universal
cluster detection in bacteria, archaea, or phages.
}

Gene clusters are identified via a density threshold (\code{AllGeneNum} and \code{MinConSeq}) due to
incomplete gene annotation coverage.
}
\examples{
   # Case 1: Using blastp result with Full pipeline (Find Cluster + Extract FASTA + Plot Cluster)
   data(blastp_df)
   data(seq_data)
   data(photosynthesis_gene_list)
   data(PGC_group)
   gc_list <- gclink(in_blastp_df = blastp_df,
                     in_seq_data = seq_data,
                     in_gene_list = photosynthesis_gene_list,
                     in_GC_group  = PGC_group,
                     AllGeneNum = 50,
                     MinConSeq  = 25,
                     apply_length_filter = TRUE,
                     down_IQR   = 10,
                     up_IQR     = 10,
                     orf_before_first = 0,
                     orf_after_last = 0,
                     levels_gene_group = c('bch','puh','puf','crt','acsF','assembly','regulator',
                                           'hypothetical ORF'),
                     color_theme = c('#3BAA51','#6495ED','#DD2421','#EF9320','#F8EB00',
                                     '#FF0683','#956548','grey'),
                     genome_subset = NULL)
   gc_meta = gc_list[["GC_meta"]]
   gc_seq = gc_list[["GC_seq"]]
   gc_plot = gc_list[["GC_plot"]]
   head(gc_meta)
   head(gc_seq)
   print(gc_plot)

   # Case 2: Using eggNOG result with Full pipeline (Find Cluster + Extract FASTA + Plot Cluster)
   data(eggnog_df)
   data(seq_data)
   data(KO_group)
   KOs = c("K02291","K09844","K20611","K13789",
           "K09846","K08926","K08927","K08928",
           "K08929","K13991","K04035","K04039",
           "K11337","K03404","K11336","K04040",
           "K03403","K03405","K04037","K03428",
           "K04038","K06049","K10960","K11333",
           "K11334","K11335","K08226","K08226",
           "K09773")
   rename_KOs = paste0("ko:", KOs)
   eggnog_df$qaccver = eggnog_df$`#query`
   eggnog_df$saccver = eggnog_df$KEGG_ko
   eggnog_df$evalue = eggnog_df$evalue
   eggnog_df$bitscore = eggnog_df$score
   eggnog_df$gene = eggnog_df$KEGG_ko
   gc_list_2 = gclink(in_blastp_df = eggnog_df,
                      in_seq_data = seq_data,
                      in_gene_list = rename_KOs,
                      in_GC_group  = KO_group,
                      AllGeneNum = 50,
                      MinConSeq  = 25,
                      apply_evalue_filter = FALSE,
                      min_evalue = 1,
                      apply_score_filter = TRUE,
                      min_score = 10,
                      orf_before_first = 1,
                      orf_after_last = 1,
                      levels_gene_group = c('bch','puh','puf','crt',
                                            'acsF','assembly','hypothetical ORF'),
                      color_theme = c('#3BAA51','#6495ED','#DD2421','#EF9320',
                                      '#F8EB00','#FF0683','grey'))
   gc_meta_2 = gc_list_2[["GC_meta"]]
   gc_seq_2 = gc_list_2[["GC_seq"]]
   gc_plot_2 = gc_list_2[["GC_plot"]]
   head(gc_meta_2)
   head(gc_seq_2)
   print(gc_plot_2)

   # SOLUTION FOR "Viewport has zero dimension(s)" ERROR in print(gc_plot)
   # --------------------------------------------------
   # When you see this error, try these 3 solutions:

   # 1. RESIZE RSTUDIO PLOT PANEL (Quickest fix)
   # Simply click and drag the right border of the plot panel wider

   # 2. LAUNCH IN NEW WINDOW
   # dev.new()
   # print(gc_plot)

   # 3. SAVE DIRECTLY TO FILE
   # ggplot2::ggsave('gc_plot.pdf', gc_plot, width=20,height=3)
}
