% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dummy_multi_choice.R
\name{step_dummy_multi_choice}
\alias{step_dummy_multi_choice}
\title{Handle levels in multiple predictors together}
\usage{
step_dummy_multi_choice(
  recipe,
  ...,
  role = "predictor",
  trained = FALSE,
  threshold = 0,
  levels = NULL,
  input = NULL,
  other = "other",
  naming = dummy_names,
  prefix = NULL,
  keep_original_cols = FALSE,
  skip = FALSE,
  id = rand_id("dummy_multi_choice")
)
}
\arguments{
\item{recipe}{A recipe object. The step will be added to the
sequence of operations for this recipe.}

\item{...}{One or more selector functions to choose variables for this step.
See \code{\link[=selections]{selections()}} for more details. The selected variables \emph{must} be
nominal (e.g. characters or factors).}

\item{role}{For model terms created by this step, what analysis role should
they be assigned? By default, the new columns created by this step from
the original variables will be used as \emph{predictors} in a model.}

\item{trained}{A logical to indicate if the quantities for
preprocessing have been estimated.}

\item{threshold}{A numeric value between 0 and 1, or an integer greater than or
equal to one. If less than one, then factor levels with a rate of
occurrence in the training set below \code{threshold} will be pooled to \code{other}.
If greater than or equal to one, then this value is treated as a frequency
and factor levels that occur fewer than \code{threshold} times will be pooled
to \code{other}.}

\item{levels}{A list that contains the information needed to create dummy
variables for each variable contained in \code{terms}. This is \code{NULL} until the
step is trained by \code{\link[=prep]{prep()}}.}

\item{input}{A character vector containing the names of the columns used.
This is \code{NULL} until the step is trained by \code{\link[=prep]{prep()}}.}

\item{other}{A single character value for the "other" category.}

\item{naming}{A function that defines the naming convention for new dummy
columns. See Details below.}

\item{prefix}{A character string for the prefix of the resulting new
variables. See the Examples below.}

\item{keep_original_cols}{A logical to keep the original variables in the
output. Defaults to \code{FALSE}.}

\item{skip}{A logical. Should the step be skipped when the
recipe is baked by \code{\link[=bake]{bake()}}? While all operations are baked
when \code{\link[=prep]{prep()}} is run, some operations may not be able to be
conducted on new data (e.g. processing the outcome variable(s)).
Care should be taken when using \code{skip = TRUE} as it may affect
the computations for subsequent operations.}

\item{id}{A character string that is unique to this step to identify it.}
}
\value{
An updated version of \code{recipe} with the new step added to the
sequence of any existing operations.
}
\description{
\code{step_dummy_multi_choice()} creates a \emph{specification} of a recipe step that
will convert multiple nominal variables (e.g. characters or factors) into one or
more numeric binary model terms for the levels of the original data.
}
\details{
The overall proportions (or total counts) of the categories are
computed. The "other" category is used in place of any categorical levels
whose individual proportion (or frequency) in the training set is less than
\code{threshold}.

This recipe step allows for flexible naming of the resulting
variables. For an unordered factor named \code{x}, with levels \code{"a"}
and \code{"b"}, the default naming convention would be to create a
new variable called \code{x_b}. The naming format can be changed using
the \code{naming} argument; the function \code{\link[=dummy_names]{dummy_names()}} is the
default.
}
\section{Tuning Parameters}{
This step has 1 tuning parameter:
\itemize{
\item \code{threshold}: Threshold (type: double, default: 0)
}
}

\section{Tidying}{
When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is returned with
columns \code{terms}, \code{columns}, and \code{id}:

\describe{
\item{terms}{character, the selectors or variables selected}
\item{columns}{character, names of resulting columns}
\item{id}{character, id of this step}
}
}

\section{Case weights}{


The underlying operation does not allow for case weights.
}

\examples{
library(tibble)
languages <- tribble(
  ~lang_1,    ~lang_2,   ~lang_3,
  "English",  "Italian", NA,
  "Spanish",  NA,        "French",
  "Armenian", "English", "French",
  NA,         NA,        NA
)

dummy_multi_choice_rec <- recipe(~., data = languages) \%>\%
  step_dummy_multi_choice(starts_with("lang")) \%>\%
  prep()

bake(dummy_multi_choice_rec, new_data = NULL)
tidy(dummy_multi_choice_rec, number = 1)

dummy_multi_choice_rec2 <- recipe(~., data = languages) \%>\%
  step_dummy_multi_choice(starts_with("lang"),
    prefix = "lang",
    threshold = 0.2
  ) \%>\%
  prep()

bake(dummy_multi_choice_rec2, new_data = NULL)
tidy(dummy_multi_choice_rec2, number = 1)
}
\seealso{
Other dummy variable and encoding steps: 
\code{\link{step_bin2factor}()},
\code{\link{step_count}()},
\code{\link{step_date}()},
\code{\link{step_dummy_extract}()},
\code{\link{step_dummy}()},
\code{\link{step_factor2string}()},
\code{\link{step_holiday}()},
\code{\link{step_indicate_na}()},
\code{\link{step_integer}()},
\code{\link{step_novel}()},
\code{\link{step_num2factor}()},
\code{\link{step_ordinalscore}()},
\code{\link{step_other}()},
\code{\link{step_regex}()},
\code{\link{step_relevel}()},
\code{\link{step_string2factor}()},
\code{\link{step_time}()},
\code{\link{step_unknown}()},
\code{\link{step_unorder}()}
}
\concept{dummy variable and encoding steps}
