% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/lda.R, R/seededlda.R
\name{textmodel_lda}
\alias{textmodel_lda}
\alias{textmodel_seededlda}
\title{Semisupervised Latent Dirichlet allocation}
\usage{
textmodel_lda(
  x,
  k = 10,
  max_iter = 2000,
  alpha = NULL,
  beta = NULL,
  model = NULL,
  verbose = quanteda_options("verbose")
)

textmodel_seededlda(
  x,
  dictionary,
  valuetype = c("glob", "regex", "fixed"),
  case_insensitive = TRUE,
  residual = FALSE,
  weight = 0.01,
  max_iter = 2000,
  alpha = NULL,
  beta = NULL,
  ...,
  verbose = quanteda_options("verbose")
)
}
\arguments{
\item{x}{the dfm on which the model will be fit}

\item{k}{the number of topics; determined automatically by the number of keys
in \code{dictionary} in \code{textmodel_seededlda()}.}

\item{max_iter}{the maximum number of iterations in Gibbs sampling.}

\item{alpha}{the value to smooth topic-document distribution; defaults to
\code{alpha = 50 / k}.}

\item{beta}{the value to smooth topic-word distribution; defaults to \code{beta = 0.1}.}

\item{model}{a fitted LDA model; if provided, \code{textmodel_lda()} inherits
parameters from an existing model. See details.}

\item{verbose}{logical; if \code{TRUE} print diagnostic information during
fitting.}

\item{dictionary}{a \code{\link[quanteda:dictionary]{quanteda::dictionary()}} with seed words that define
topics.}

\item{valuetype}{see \link[quanteda:valuetype]{quanteda::valuetype}}

\item{case_insensitive}{see \link[quanteda:valuetype]{quanteda::valuetype}}

\item{residual}{if \code{TRUE} a residual topic (or "garbage topic") will be
added to user-defined topics.}

\item{weight}{pseudo count given to seed words as a proportion of total
number of words in \code{x}.}

\item{...}{passed to \link[quanteda:dfm_trim]{quanteda::dfm_trim} to restrict seed words based on
their term or document frequency. This is useful when glob patterns in the
dictionary match too many words.}
}
\value{
\code{textmodel_seededlda()} and \code{textmodel_lda()} return a list of model
parameters. \code{theta} is the distribution of topics over documents; \code{phi} is
the distribution of words over topics. \code{alpha} and \code{beta} are the small
constants added to the frequency of words to estimate \code{theta} and \code{phi},
respectively, in Gibbs sampling. Other elements in the list are subject to
change.
}
\description{
\code{textmodel_seededlda()} implements semisupervised Latent Dirichlet allocation
(seeded-LDA). The estimator's code is adopted from the GibbsLDA++ library
(Xuan-Hieu Phan, 2007). \code{textmodel_seededlda()} allows identification of
pre-defined topics by semisupervised learning with a seed word dictionary.
}
\details{
To predict topics of new documents (i.e. out-of-sample), first,
create a new LDA model from an existing LDA model passed to \code{model} in
\code{textmodel_lda()}; second, apply \code{\link[=topics]{topics()}} to the new model. The \code{model}
argument takes objects created either by \code{textmodel_lda()} or
\code{textmodel_seededlda()}.
}
\examples{
\donttest{
require(seededlda)
require(quanteda)

# load the movie review corpus and use its first 500 documents
data("data_corpus_moviereviews", package = "quanteda.textmodels")
corp <- head(data_corpus_moviereviews, 500)
# tokenize, dropping punctuation, symbols and numbers
toks <- tokens(corp, remove_punct = TRUE, remove_symbols = TRUE, remove_numbers = TRUE)
# build the document-feature matrix: remove English stopwords and
# single-character features, then trim by term-frequency quantile and by
# the proportion of documents in which a feature occurs
dfmt <- dfm(toks) \%>\%
    dfm_remove(stopwords('en'), min_nchar = 2) \%>\%
    dfm_trim(min_termfreq = 0.90, termfreq_type = "quantile",
             max_docfreq = 0.1, docfreq_type = "prop")

# unsupervised LDA
# fit k = 6 topics on the first 450 documents
lda <- textmodel_lda(head(dfmt, 450), 6)
terms(lda)
topics(lda)
# predict topics of the remaining 50 documents by passing the fitted model
lda2 <- textmodel_lda(tail(dfmt, 50), model = lda) # new documents
topics(lda2)

# semisupervised LDA
# each dictionary key names a topic; its values are the seed words
dict <- dictionary(list(people = c("family", "couple", "kids"),
                        space = c("alien", "planet", "space"),
                        monster = c("monster*", "ghost*", "zombie*"),
                        war = c("war", "soldier*", "tanks"),
                        crime = c("crime*", "murder", "killer")))
# residual = TRUE adds a residual topic to the user-defined ones;
# min_termfreq is passed on via ... to restrict the seed words
slda <- textmodel_seededlda(dfmt, dict, residual = TRUE, min_termfreq = 10)
terms(slda)
topics(slda)

}
}
\references{
Lu, Bin et al. (2011). "Multi-aspect Sentiment Analysis with
Topic Models". doi:10.5555/2117693.2119585. \emph{Proceedings of the 2011 IEEE
11th International Conference on Data Mining Workshops}.

Watanabe, Kohei & Zhou, Yuan (2020). "Theory-Driven Analysis of Large
Corpora: Semisupervised Topic Classification of the UN Speeches".
doi:10.1177/0894439320907027. \emph{Social Science Computer Review}.
}
\seealso{
\link[topicmodels:lda]{topicmodels}
}
\keyword{textmodel}
