\documentclass[article, shortnames, nojss]{jss}
\usepackage{amsmath, amsthm, amssymb, amscd, ifthen, subfigure, psfrag}
\renewcommand{\textfraction}{0.05}
\renewcommand{\topfraction}{0.95}
\renewcommand{\bottomfraction}{0.95}
\renewcommand{\floatpagefraction}{0.35}
\usepackage{afterpage}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% declarations for jss.cls %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\VignetteIndexEntry{a guide to the LogConcDEAD package}
%\VignetteKeywords{LogConcDEAD}
%\VignetteDepends{LogConcDEAD, rgl, MASS, mvtnorm}
%\VignettePackage{LogConcDEAD}
%% almost as usual
\author{Madeleine Cule, Robert Gramacy and Richard Samworth\\
University of Cambridge}
\title{\pkg{LogConcDEAD}: An \proglang{R} Package for Maximum Likelihood
Estimation of a Multivariate Log-Concave Density}
%% for pretty printing and a nice hypersummary also set:
\Plainauthor{Madeleine Cule, Robert Gramacy, Richard
Samworth} %% comma-separated
\Plaintitle{LogConcDEAD: An R Package for Maximum Likelihood Estimation of
a Multivariate Log-Concave Density} %% without formatting
\Shorttitle{\pkg{LogConcDEAD}: Multivariate Log-Concave Density Estimation} %% a short title (if necessary)
%% an abstract and keywords
\Abstract{In this document we introduce the \proglang{R} package
\pkg{LogConcDEAD} (Log-concave density estimation in arbitrary
dimensions). Its main function is to compute the nonparametric
maximum likelihood estimator of a log-concave density. Functions
for plotting, sampling from the density estimate and evaluating the
density estimate are provided. All of the functions available in
the package are illustrated using simple, reproducible examples with
simulated data.} \Keywords{log-concave density, multivariate
density estimation, visualization, nonparametric statistics}
\Plainkeywords{log-concave density, multivariate density estimation,
visualization, nonparametric statistics}
\Address{
Madeleine Cule, Robert Gramacy, Richard Samworth\\
Statistical Laboratory\\
Centre for Mathematical Sciences\\
Wilberforce Road\\
Cambridge CB3 0WG\\
E-mail: \email{\{mlc40,bobby,rjs57\}@statslab.cam.ac.uk}\\
URL: \url{http://www.statslab.cam.ac.uk/~{mlc40,bobby,rjs57}}
}
%% end of declarations %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%My handy macros:
%KL divergence
\newcommand{\dkl}[2]{d_{KL}(#1 | #2)}
%Hellinger distance
\newcommand{\dhell}[2]{d_h(#1,#2)}
%Various stuff for proofs
\newcommand{\st}{\textrm{ subject to }}
\newcommand{\rarrow}{\rightarrow}
\newcommand{\convp}{\stackrel{p}{\rightarrow}}
\newcommand{\convas}{\stackrel{\textit{a.s.}}{\rightarrow}}
\newcommand{\convd}{\stackrel{d}{\rightarrow}}
\newcommand{\real}[1]{\mathbb{R}^{#1}}
\newcommand{\rn}{\mathbb{R}^n}
\newcommand{\rd}{\mathbb{R}^d}
\newcommand{\rk}{\mathbb{R}^k}
\newcommand{\epi}{\textrm{epi }}
\newcommand{\cl}{\textrm{cl }}
\newcommand{\conv}{\textrm{conv }}
\newcommand{\dom}{\textrm{dom }}
\newcommand{\supp}{\textrm{supp }}
%I prefer wide bars, hats etc and varphi
\renewcommand{\phi}{\varphi}
\renewcommand{\hat}{\widehat}
\renewcommand{\tilde}{\widetilde}
%various useful quantities
\newcommand{\emp}{P_n}
\newcommand{\est}{\hat{p}_n}
\newcommand{\true}{p_0}
\newcommand{\maxlike}{\max \prod_{i=1}^n p(X_i)}
\newcommand{\hbary}{\bar{h}_y}
\newcommand{\hty}{h^{\mathcal{T}}_y}
\newcommand{\myintegral}{\int_{C_n} \exp\{\hbary(x)\} \, dx}
\newcommand{\kt}{\mathcal{K}^{\mathcal{T}}}
\newcommand{\sigt}{\sigma^{\mathcal{T}}}
\newcommand{\new}{\textrm{new}}
\newcommand{\va}{\mathcal{V}(\mathcal{A})}
%%% Theorem style I like
\newtheorem{theorem}{Theorem}[section]
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{prop}[theorem]{Proposition}
\newtheorem{cor}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{remark}[theorem]{Remark}
\newtheorem{eg}[theorem]{Example}
%\theoremstyle{remark}
\newtheorem{assump}[theorem]{Assumption}
%some mathcals and mathbbs
\renewcommand{\AA}{\mathcal{A}}
\newcommand{\VV}{\mathcal{V}}
\newcommand{\KK}{\mathcal{K}}
\newcommand{\RR}{\mathbb{R}}
\newcommand{\FF}{\mathcal{F}}
%%Here is where the actual document begins:
\begin{document}
<<echo=FALSE,results=hide>>=
options(prompt="R> ")
require( "LogConcDEAD" )
require( "mvtnorm" )
options(width=72)
@
\section{Introduction}
\label{sec:intro}
\subsection{About this document}
\label{sec:about}
This document is an introduction to the \proglang{R} package
\pkg{LogConcDEAD} (log-concave density estimation in arbitrary
dimensions) based on \citet{CGS2008}. It aims to provide a detailed user guide based on simple,
reproducible worked examples. This package is available from the
Comprehensive \proglang{R} Archive Network at
\url{http://CRAN.R-project.org/package=LogConcDEAD}.
\pkg{LogConcDEAD} depends on \pkg{MASS} \citep{MASS2002} for some
vector operations and \pkg{geometry} \citep{GramacyGrasman2008} for
convex hull computation. The package \pkg{rgl}
\citep{AdlerMurdoch2007} is recommended for producing graphics.
This document was created using \code{Sweave} \citep{Sweave} and
\LaTeX{} \citep{Lamport1994} using \proglang{R} \citep{R}. This means
that all of the code has been checked by \proglang{R}, and can be
reproduced exactly by setting an appropriate seed (as given at the
beginning of each example), or tested on different examples by using a
different seed.
\subsection{Log-concave density estimation}
\label{sec:probintro}
We address the fundamental statistical
problem of estimating a probability density function $f_0$ from independent
and identically distributed observations $X_1, \ldots, X_n$ taking
values in $\mathbb{R}^d$.
If a suitable parametric model is available, a common method is to use
maximum likelihood to estimate the parameters of the model.
Otherwise, a standard nonparametric approach is based on kernel
density estimation \citep{WandJones1995}, which has been implemented
in the \proglang{R} function \code{density}. In common with many
nonparametric methods, kernel density estimation requires the careful
specification of a smoothing parameter. For multivariate data, the
smoothing parameter is a bandwidth matrix with up to $\frac{1}{2}d(d+1)$
entries to choose, meaning that this method can
be especially difficult to apply in practice.
An alternative to kernel density estimation or other estimation techniques
based on smoothing (all of which require the selection of a smoothing
parameter, which is nontrivial especially in the multivariate case) is
to impose some qualitative shape restrictions on the density. If the
shape restrictions are suitable, there is enough structure to
guarantee the existence of a unique and fully automatic maximum
likelihood estimate, even though the class of densities may be
infinite-dimensional. This therefore avoids both the restrictions of
a parametric model and the difficulty of bandwidth selection in kernel
density estimation. The price is some restriction on the shape of the
density. However, these restrictions are less severe than those
imposed by a parametric model.
Shape-constrained maximum likelihood dates back to
\citet{Grenander1956}, who treated monotone densities in the context
of mortality data. Recently there has been considerable interest in
alternative shape constraints, including convexity, $k$-monotonicity
and log-concavity \citep{GJW2001, DuembgenRufibach2009,
BalabdaouiWellner2007}. However, these works have all focused on the
case of univariate data.
\subsubsection{Log-concave densities}
A function $g \colon \RR^d \rightarrow
[-\infty, \infty)$ is
concave if
\[g( \lambda x + (1 - \lambda) y) \geq \lambda g(x) + (1-\lambda)g(y)\]
for all $x, y \in \RR^d$ and $\lambda \in (0,1)$. This corresponds to
what \citet{Rockafellar1997} calls a proper concave function.
We say a probability density function $f$ is log-concave if $\log f$
is a concave function.
Several common parametric families of
univariate densities are log-concave, such as Gaussian, logistic and
Gumbel densities, as well as Weibull, Gamma and Beta densities for
certain parameter values \citep{An1998}. In fact, \citet{CSS2010}
showed that even though the class of multivariate log-concave
densities is large (infinite-dimensional), it still retains some of
the simple and attractive properties of the class of Gaussian
densities.
One-dimensional log-concave density estimation via maximum likelihood
is discussed in \citet{DuembgenRufibach2009}; computational aspects are
treated in \citet{Rufibach2007}. It is in the multivariate case,
however, where kernel density estimation is more difficult and
parametric models less obvious, where a log-concave model may be most
useful.
Theoretical and computational aspects of multivariate log-concave
density estimation are treated in \citet{CSS2010}. In particular, it
is proved that if $Y_1, \ldots, Y_m$ are (distinct) independent and
identically distributed observations from a distribution with
log-concave density $f_0$ on $\rd$, then (with probability $1$) there is a
unique log-concave density $\hat{f}_m$ satisfying
\begin{equation}
\label{eq:maxlike}
\hat{f}_m = \argmax_{f \in \FF} \frac{1}{m} \sum_{i=1}^m \log f(Y_i),
\end{equation}
where $\FF$
is the class of all log-concave densities on $\rd$. Further, it is shown that
this infinite dimensional maximization problem can be reduced to that
of maximizing over functions of the form $\hbary$ for some $y =
(y_1,\ldots,y_m) \in \RR^m$, where
\begin{equation}
\label{eq:hbary}
\bar{h}_y(x) = \inf \{h(x) \colon h
\textrm{ is concave},\; h(Y_i) \geq y_i,\; i = 1, \ldots, m\}.
\end{equation}
As discussed in \citet{CSS2010}, we may think of $\hbary$ as the
function obtained by placing a pole of height $y_i$ at $X_i$ and
stretching a rubber sheet over the top of the poles.
Therefore, to completely specify the maximum likelihood estimator, we
need only specify a suitable vector $\hat{y} \in \RR^m$, as this
defines the entire function $\bar{h}_{\hat{y}}$. A main feature of
the \pkg{LogConcDEAD} package is that it provides an iterative
algorithm for finding such an appropriate vector $\hat{y}$.
From our knowledge of the structure of functions of the form \eqref{eq:hbary},
we may deduce some additional properties of $\hat{f}_m$. It is zero
outside the convex hull of the data, and strictly positive inside
the convex hull. Moreover, we can find a triangulation of the convex hull into simplices (triangles when $d=2$, tetrahedra when $d=3$, and
so on) such that $\log \hat{f}_m$ is affine on each simplex \citep{Rockafellar1997}.
In practice our observations will be made only to a finite precision,
so the observations will not necessarily be distinct. However, the
same method of proof shows that, more generally, if $X_1, \ldots, X_n$
are distinct points in $\rd$ and $w_1, \ldots, w_n$ are strictly
positive weights satisfying $\sum_{i=1}^n w_i = 1$, then there is a
unique log-concave density $\hat{f}_n$, which is of the form $\hat{f}_n =
\exp(\bar{h}_y)$ for some $y \in \rn$, and which satisfies
\begin{equation}\label{eq:weights}
\hat{f}_n = \argmax_{f \in \FF} \sum_{i=1}^n w_i \log f(X_i).
\end{equation}
The default case $w_i = \frac{1}{n}$ corresponds to the situation
described above, and is appropriate for most situations. However, the
generalization \eqref{eq:weights} obtained by allowing $w_i \neq
\frac{1}{n}$ allows us to extend to binned
observations. In more detail, if $Y_1, \ldots, Y_m$ are independent
and identically distributed according to a density $f_0$, and distinct
binned values $X_1, \ldots, X_n$ are observed, we may construct a
maximum likelihood problem of the form given in~\eqref{eq:weights},
setting
\[w_i = \frac{\# \text{ of times value }X_i\text{ is observed}}{m}\]
and
\[\hat{f}_n = \argmax_{f \in \FF} \sum_{i=1}^n w_i \log f(X_i).\]
This generalization may also be used for a multivariate version of a
log-concave EM algorithm (\citealp{ChangWalther2007}, also discussed in \citealp{CSS2010}).
\subsection{Outline of the remainder of this document}
\label{sec:outline}
In Section~\ref{sec:alg}, we outline the algorithm used to
compute the maximum likelihood estimator, including various parameters
used in the computation. This is essentially an adaptation of Shor's
$r$-algorithm \citep{Shor1985} (implemented as \pkg{SolvOpt} by
\citet{KappelKuntsevich2000}), and depends on the \pkg{Quickhull}
algorithm for computing convex hulls \citep{BDH1996}.
This section may be skipped on first reading.
In Section~\ref{sec:usage}, we demonstrate the main features of the package
through four simple examples (one with $d=1$,
two with $d=2$ and one with $d=3$). This section includes a
description of all of the parameters used, as well as the output
structures. We also introduce the plotting functions available,
as well as functions for sampling from the density estimate and
for evaluating the density at a particular point.
\section{Algorithm}
\label{sec:alg}
\subsection{Introduction}
\label{sec:algintro}
Recall that the maximum likelihood estimator $\hat{f}_n$ of $f_0$
may be completely specified by its values at the observations
$X_1, \ldots, X_n$. Writing $C_n$ for the convex hull of the data, \citet{CSS2010} showed that the problem of
computing the estimator may be rephrased as one of finding
\[\argmin_{y \in \rn} \sigma(y) = - \sum_{i=1}^n w_i y_i +
\myintegral\] for suitably chosen weights $w_i$, where
\[\bar{h}_y(x) = \inf \{h(x) \colon h \textrm{ is concave},\; h(X_i) \geq y_i,\; i
= 1, \ldots, n\}.\]
The function $\sigma$ is convex, but not differentiable, so standard
gradient-based convex optimization techniques such as Newton's method
are not suitable. Nevertheless, the notion of a subgradient is still
valid: a subgradient at $y$ of $\sigma$ is any direction which defines
a supporting hyperplane to $\sigma$ at $y$. \citet{Shor1985}
developed a theory of subgradient methods for handling convex,
non-differentiable optimization problems. The $r$-algorithm,
described in \citet[][Chapter 3]{Shor1985} and implemented as
\pkg{SolvOpt} in \proglang{C} by \citet{KappelKuntsevich2000}, was
found to work particularly well in practice. A main feature of the
\pkg{LogConcDEAD} package is an implementation of an adaptation of this
$r$-algorithm for the particular problem encountered in log-concave
density estimation.
\subsection[Shor's r-algorithm]{Shor's $r$-algorithm}
\label{sec:shor}
Our adaptation of Shor's $r$-algorithm produces a sequence $(y^t)$
with the property that
\[\sigma(y^t) \rightarrow \min_{y \in \rn}
\sigma(y)\]
as $t \rightarrow \infty$. At each iteration, the
algorithm requires the evaluation $\sigma(y^t)$, and the subgradient
at $y^t$, denoted $\partial \sigma(y^t)$, which determines the
direction of the move to the next term $y^{t+1}$ in the sequence.
Exact expressions for $\sigma(y^t)$ and $\partial \sigma(y^t)$ are
provided in \citet{CSS2010}. In practice, their computation requires
the evaluation of convex hulls and triangulations of certain finite
sets of points. This can be done in a fast and robust way via the
\pkg{Quickhull} algorithm \citep{BDH1996}, available in \proglang{R}
through the \pkg{geometry} package \citep{GramacyGrasman2008}. Due to
the presence of some removable singularities in the expressions for
$\sigma(y^t)$ and $\partial \sigma(y^t)$, it is computationally more
stable to use a Taylor approximation to the true values for certain
values of $y^t$ \citep{CuleDuembgen2008}. The values for which a
Taylor expansion (rather than direct evaluation) is used may be
controlled by the argument \code{Jtol} to the \pkg{LogConcDEAD}
function \code{mlelcd}. By default this is $10^{-3}$; altering this
parameter is not recommended.
Several parameters may be used to control the $r$-algorithm as detailed by
\citet{KappelKuntsevich2000}. In the function \code{mlelcd}, they may
be controlled by the user via the arguments \code{stepscale1},
\code{stepscale2}, \code{stepscale3}, \code{stepscale4} and
\code{desiredsize}. For a detailed description of these parameters, as
well as of this implementation of the $r$-algorithm, see
\citet{KappelKuntsevich2000}.
\subsubsection{Stopping criteria}
\label{sec:stopping}
The implementation of the $r$-algorithm used in the main function
\code{mlelcd} terminates after the $(t+1)$th iteration if each of
the following conditions holds:
\begin{align}
\lvert y^{t+1}_i - y^t_i \rvert &\leq \delta \lvert y^t_i
\rvert \textrm{ for } i = 1, \ldots, n \label{ytol}\\
\lvert \sigma(y^{t+1}) - \sigma(y^t) \rvert &\leq \epsilon
\lvert \sigma(y^t) \rvert \label{sigmatol} \\
\left\vert \int_{C_n} \exp\{\bar{h}_{y^t}(x)\}\, dx - 1 \right\vert&
\leq \eta \label{inttol}
\end{align}
for some small tolerances $\delta$, $\epsilon$ and $\eta$.
\eqref{ytol} and \eqref{sigmatol} are the criteria suggested by
\citet{KappelKuntsevich2000}; \eqref{inttol} is based on the
observation that the maximum likelihood estimator is a
density \citep{CSS2010}. By default, these values are $\delta =
10^{-4}$, $\epsilon = 10^{-8}$ and $\eta = 10^{-4}$, but they may be
modified by the user as required, using the parameters \code{ytol},
\code{sigmatol} and \code{integraltol} respectively. The default
parameters have been found to work well and it is not recommended to
alter them.
\section{Usage}
\label{sec:usage}
In this section we illustrate the functions available in \pkg{LogConcDEAD}
through several simple simulated data examples. These functions include
\code{mlelcd}, which computes the maximum likelihood estimator, as well as
graphics facilities and the function \code{rlcd} for sampling from the fitted density.
\subsection{Example 1: 1-d data}
\label{sec:eg1d}
<