% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/genDataPreprocess.R
\name{genDataPreprocess}
\alias{genDataPreprocess}
\title{Pre-processing of the genetic data}
\usage{

  genDataPreprocess(data.in = stop("You have to give the object to preprocess!"),
  map.file, design = "triad", file.out = "data_preprocessed",
  dir.out = ".", ncpu = 1, overwrite = NULL)
}
\arguments{
\item{data.in}{Input data, as loaded by \link{genDataRead} or \link{genDataLoad}.}

\item{map.file}{Filename (with path if the file is not in current directory) of the
.map file holding the SNP names, if available.}

\item{design}{The design used in the study - choose from:
\itemize{
  \item \emph{triad} - (default), data includes genotypes of mother, father and child;
  \item \emph{cc} - classical case-control;
  \item \emph{cc.triad} - hybrid design: triads with cases and controls.
}}

\item{file.out}{The core name of the files that will contain the preprocessed data
(character string); the data can be loaded later with the \link{genDataLoad} function.
Default: "data_preprocessed".}

\item{dir.out}{The directory that will contain the saved data; defaults to current 
working directory.}

\item{ncpu}{The number of CPU cores to use - this speeds up the process significantly
for large datasets. Default is 1 core; the maximum is one less than the total number
of cores available on the current machine (even if the number given by the user is
more than that).}

\item{overwrite}{Whether to overwrite the output files: if NULL (default), the user
will be prompted for an answer; if TRUE, any existing files will be overwritten
automatically; if FALSE, the function will stop if the output files exist.}
}
\value{
A list object with three elements:
  \itemize{
    \item \emph{cov.data} - a \code{data.frame} with covariate data (if available in
       the input file);
    \item \emph{gen.data} - a list with chunks of the genetic data; the data is divided
       column-wise, using 10,000 columns per chunk; each element of this list is a
       \link[ff]{ff} matrix;
    \item \emph{aux} - a list with meta-data and important parameters:
    \itemize{
      \item \emph{variables} - tabulated information of the covariate data;
      \item \emph{variables.nas} - how many NA values in each column of covariate data;
      \item \emph{alleles} - all the possible alleles in each marker;
      \item \emph{alleles.nas} - how many NA values in each marker;
      \item \emph{nrows.with.missing} - how many rows contain any missing allele information;
      \item \emph{which.rows.with.missing} - vector of indices of rows with missing data (if any).
    }
  }
}
\description{
This function prepares the data to be used in Haplin analysis.
}
\section{Details}{

The .map file should contain at least two columns, the second of which contains the SNP
  names. Any additional columns should be separated by whitespace characters; they will
  be ignored. The file should contain a header.
}

\examples{
  # NOTE: every call below uses 'overwrite = TRUE', so any existing output
  # files are replaced without prompting.

  # Step 1 - read the example .ped file shipped with the package:
  ped.path <- file.path( system.file( "extdata", package = "Haplin" ), "exmpl_data.ped" )
  ped.data.read <- genDataRead(
    ped.path,
    format = "ped",
    file.out = "exmpl_ped_data",
    dir.out = tempdir( check = TRUE ),
    overwrite = TRUE
  )
  ped.data.read

  # Step 2 - keep only a subset of the markers (optional):
  ped.data.part <- genDataGetPart(
    ped.data.read,
    design = "triad",
    markers = 10:12,
    file.out = "exmpl_ped_data_part",
    dir.out = tempdir( check = TRUE ),
    overwrite = TRUE
  )

  # Step 3 - preprocess the subset using the "triad" design:
  ped.data.preproc <- genDataPreprocess(
    ped.data.part,
    design = "triad",
    file.out = "exmpl_data_preproc",
    dir.out = tempdir( check = TRUE ),
    overwrite = TRUE
  )
  ped.data.preproc

}
