\documentclass{chapman}
%%% copy Sweave.sty definitions
%%% keeps `sweave' from adding `\usepackage{Sweave}': DO NOT REMOVE
%\usepackage{Sweave}
\RequirePackage[T1]{fontenc}
\RequirePackage{graphicx,ae,fancyvrb}
\IfFileExists{upquote.sty}{\RequirePackage{upquote}}{}
\usepackage{relsize}
\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
fontshape=it,
fontsize=\relsize{-1}}
\DefineVerbatimEnvironment{Scode}{Verbatim}{}
\newenvironment{Schunk}{}{}
%%% environment for raw output
\newcommand{\SchunkRaw}{\renewenvironment{Schunk}{}{}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
fontshape=it,
fontsize=\small}
\rawSinput
}
%%% environment for labeled output
\newcommand{\nextcaption}{}
\newcommand{\SchunkLabel}{
\renewenvironment{Schunk}{\begin{figure}[ht] }{\caption{\nextcaption}
\end{figure} }
\DefineVerbatimEnvironment{Sinput}{Verbatim}{frame = topline}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{frame = bottomline,
samepage = true,
fontfamily=courier,
fontshape=it,
fontsize=\relsize{-1}}
}
%%% S code with line numbers
\DefineVerbatimEnvironment{Sinput}
{Verbatim}
{
%% numbers=left
}
\newcommand{\numberSinput}{
\DefineVerbatimEnvironment{Sinput}{Verbatim}{numbers=left}
}
\newcommand{\rawSinput}{
\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
}
%%% R / System symbols
\newcommand{\R}{\textsf{R}}
\newcommand{\rR}{{R}}
\renewcommand{\S}{\textsf{S}}
\newcommand{\SPLUS}{\textsf{S-PLUS}}
\newcommand{\rSPLUS}{{S-PLUS}}
\newcommand{\SPSS}{\textsf{SPSS}}
\newcommand{\EXCEL}{\textsf{Excel}}
\newcommand{\ACCESS}{\textsf{Access}}
\newcommand{\SQL}{\textsf{SQL}}
%%\newcommand{\Rpackage}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Robject}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\Rclass}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Rcmd}[1]{\hbox{\rm\texttt{#1}}}
\newcommand{\Rpackage}[1]{\index{#1 package@{\fontseries{b}\selectfont #1} package} {\fontseries{b}\selectfont #1}}
\newcommand{\rpackage}[1]{{\fontseries{b}\selectfont #1}}
\newcommand{\Robject}[1]{\texttt{#1}}
\newcommand{\Rclass}[1]{\index{#1 class@\textit{#1} class}\textit{#1}}
\newcommand{\Rcmd}[1]{\index{#1 function@\texttt{#1} function}\texttt{#1}}
\newcommand{\Roperator}[1]{\texttt{#1}}
\newcommand{\Rarg}[1]{\texttt{#1}}
\newcommand{\Rlevel}[1]{\texttt{#1}}
%%% other symbols
\newcommand{\file}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\stress}[1]{\index{#1}\textit{#1}}
\newcommand{\stress}[1]{\textit{#1}}
\newcommand{\booktitle}[1]{\textit{#1}} %%'
%%% Math symbols
\usepackage{amstext}
\usepackage{amsmath}
\newcommand{\E}{\mathsf{E}}
\newcommand{\Var}{\mathsf{Var}}
\newcommand{\Cov}{\mathsf{Cov}}
\newcommand{\Cor}{\mathsf{Cor}}
\newcommand{\x}{\mathbf{x}}
\newcommand{\y}{\mathbf{y}}
\renewcommand{\a}{\mathbf{a}}
\newcommand{\W}{\mathbf{W}}
\newcommand{\C}{\mathbf{C}}
\renewcommand{\H}{\mathbf{H}}
\newcommand{\X}{\mathbf{X}}
\newcommand{\B}{\mathbf{B}}
\newcommand{\V}{\mathbf{V}}
\newcommand{\I}{\mathbf{I}}
\newcommand{\D}{\mathbf{D}}
\newcommand{\bS}{\mathbf{S}}
\newcommand{\N}{\mathcal{N}}
\renewcommand{\L}{L}
\renewcommand{\P}{\mathsf{P}}
\newcommand{\K}{\mathbf{K}}
\newcommand{\m}{\mathbf{m}}
\newcommand{\argmin}{\operatorname{argmin}\displaylimits}
\newcommand{\argmax}{\operatorname{argmax}\displaylimits}
\newcommand{\bx}{\mathbf{x}}
\newcommand{\bbeta}{\mathbf{\beta}}
%%% links
\usepackage{hyperref}
\hypersetup{%
pdftitle = {A Handbook of Statistical Analyses Using R (3rd Edition)},
pdfsubject = {Book},
pdfauthor = {Torsten Hothorn and Brian S. Everitt},
colorlinks = {black},
linkcolor = {black},
citecolor = {black},
urlcolor = {black},
hyperindex = {true},
linktocpage = {true},
}
%%% captions & tables
%% : conflics with figure definition in chapman.cls
%%\usepackage[format=hang,margin=10pt,labelfont=bf]{caption}
%%
\usepackage{longtable}
\usepackage[figuresright]{rotating}
%%% R symbol in chapter 1
\usepackage{wrapfig}
%%% Bibliography
\usepackage[round,comma]{natbib}
\renewcommand{\refname}{References \addcontentsline{toc}{chapter}{References}}
\citeindexfalse
%%% texi2dvi complains that \newblock is undefined, hm...
\def\newblock{\hskip .11em plus .33em minus .07em}
%%% Example sections
\newcounter{exercise}[chapter]
\setcounter{exercise}{0}
\newcommand{\exercise}{\stepcounter{exercise} \item{Ex.~\arabic{chapter}.\arabic{exercise} }}
%% URLs
\newcommand{\curl}[1]{\begin{center} \url{#1} \end{center}}
%%% for manual corrections
%\renewcommand{\baselinestretch}{2}
%%% plot sizes
\setkeys{Gin}{width=0.95\textwidth}
%%% color
\usepackage{color}
%%% hyphenations
\hyphenation{drop-out}
\hyphenation{mar-gi-nal}
%%% new bidirectional quotes need
\usepackage[utf8]{inputenc}
%\usepackage{setspace}
\definecolor{sidebox_todo}{rgb}{1,1,0.2}
\newcommand{\todo}[1]{
\hspace{0pt}%
\marginpar{%
\fcolorbox{black}{sidebox_todo}{%
\parbox{\marginparwidth} {
\raggedright\sffamily\footnotesize{TODO: #1}%
}
}%
}
}
\begin{document}
%% Title page
\title{A Handbook of Statistical Analyses Using \R{} --- 3rd Edition}
\author{Torsten Hothorn and Brian S. Everitt}
\maketitle
%%\VignetteIndexEntry{Multiple Linear Regression}
%%\VignetteDepends{wordcloud}
\setcounter{chapter}{5}
\SweaveOpts{prefix.string=figures/HSAUR,eps=FALSE,keep.source=TRUE}
<>=
rm(list = ls())
s <- search()[-1]
s <- s[-match(c("package:base", "package:stats", "package:graphics", "package:grDevices",
"package:utils", "package:datasets", "package:methods", "Autoloads"), s)]
if (length(s) > 0) sapply(s, detach, character.only = TRUE)
if (!file.exists("tables")) dir.create("tables")
if (!file.exists("figures")) dir.create("figures")
set.seed(290875)
options(prompt = "R> ", continue = "+ ",
width = 63, # digits = 4,
show.signif.stars = FALSE,
SweaveHooks = list(leftpar = function()
par(mai = par("mai") * c(1, 1.05, 1, 1)),
bigleftpar = function()
par(mai = par("mai") * c(1, 1.7, 1, 1))))
HSAURpkg <- require("HSAUR3")
if (!HSAURpkg) stop("cannot load package ", sQuote("HSAUR3"))
rm(HSAURpkg)
### hm, R-2.4.0 --vanilla seems to need this
a <- Sys.setlocale("LC_ALL", "C")
###
book <- TRUE
refs <- cbind(c("AItR", "DAGD", "SI", "CI", "ANOVA", "MLR", "GLM",
"DE", "RP", "GAM", "SA", "ALDI", "ALDII", "SIMC", "MA", "PCA",
"MDS", "CA"), 1:18)
ch <- function(x) {
ch <- refs[which(refs[,1] == x),]
if (book) {
return(paste("Chapter~\\\\ref{", ch[1], "}", sep = ""))
} else {
return(paste("Chapter~", ch[2], sep = ""))
}
}
if (file.exists("deparse.R"))
source("deparse.R")
setHook(packageEvent("lattice", "attach"), function(...) {
lattice.options(default.theme =
function()
standard.theme("pdf", color = FALSE))
})
@
\pagestyle{headings}
<>=
book <- FALSE
@
<>=
library("wordcloud")
@
\chapter[Simple and Multiple Linear Regression]{Simple and Multiple Linear Regression: \\ How Old is the Universe and Cloud Seeding \label{MLR}}
\section{Introduction}
\index{Age of the Universe}
\cite{HSAUR:Freedmanetal2001} give the relative velocity and the distance of $24$ galaxies,
according to measurements made using the Hubble Space Telescope -- the data
are contained in the \Rpackage{gamair} package accompanying \cite{HSAUR:Wood2006}, see
Table~\ref{MLR-hubble-tab}.
Velocities are assessed by measuring the Doppler red shift in the spectrum of
light observed from the galaxies concerned, although some correction
for `local' velocity components is required. Distances are measured
using the known relationship between the period of Cepheid variable
stars and their luminosity. How can these data be used to estimate
the age of the universe? Here we shall show how this can be done
using simple linear regression.
<>=
data("hubble", package = "gamair")
names(hubble) <- c("galaxy", "velocity", "distance")
toLatex(HSAURtable(hubble, package = "gamair"), pcol = 2,
caption = paste("Distance and velocity for 24 galaxies."),
label = "MLR-hubble-tab")
@
\vspace*{-1cm}
\textit{Source}: From Freedman W. L., et al., \textit{The Astrophysical Journal},
553, 47--72, 2001. With permission.
\vspace*{1cm}
\index{Cloud seeding}
{\tabcolsep3.5pt
<>=
data("clouds", package = "HSAUR3")
names(clouds) <- c("seeding", "time", "sne", "cloudc", "prewet", "EM", "rain")
toLatex(HSAURtable(clouds), pcol = 1,
caption = paste("Cloud seeding experiments in Florida -- see text for",
"explanations of the variables. Note that the \\Robject{clouds} data set has slightly different variable names."),
label = "MLR-clouds-tab")
@
}
Weather modification, or cloud seeding, is the treatment of individual
clouds or storm systems with various inorganic and organic materials
in the hope of achieving an increase in rainfall. Introduction
of such material into a cloud that contains supercooled water,
that is, liquid water colder than zero degrees Celsius,
has the aim of
inducing freezing, with the consequent ice particles growing
at the expense of liquid droplets and becoming heavy enough to
fall as rain from clouds that otherwise would produce none.
The data shown in Table~\ref{MLR-clouds-tab} were collected in the summer
of 1975 from an experiment to investigate the use of massive
amounts of silver iodide ($100$ to $1000$ grams per cloud) in cloud
seeding to increase rainfall \citep{HSAUR:Woodleyetal1977}.
In the experiment, which was conducted
in an area of Florida, 24 days were judged suitable for seeding
on the basis that a measured suitability criterion, denoted \stress{S-Ne},
was not less than $1.5$. Here \stress{S} is the `seedability', %'
the difference between the maximum height of a cloud if seeded
and the same cloud if not seeded predicted by a suitable cloud
model, and \stress{Ne} is the number of hours between $1300$
and $1600$ G.M.T. with $10$ centimeter echoes in the target; this
quantity biases the decision for experimentation against naturally
rainy days. Consequently, optimal days for seeding are those
on which seedability is large and the natural rainfall early
in the day is small.
On suitable days, a decision was taken at random as to whether
to seed or not. For each day the following variables were measured:
\begin{description}
\item[\Robject{seeding}] a factor indicating whether seeding action occurred (yes or no),
\item[\Robject{time}] number of days after the first day of the experiment,
\item[\Robject{cloudc}] the percentage cloud cover in the experimental area,
measured using radar,
\item[\Robject{prewet}] the total rainfall in the target area one hour before seeding
(in cubic meters $\times 10^{7}$),
\item[\Robject{EM}] a factor showing whether the radar echo was moving or
stationary,
\item[\Robject{rain}] the amount of rain in cubic meters $\times 10^{7}$,
\item[\Robject{sne}] suitability criterion, see above.
\end{description}
The objective in analyzing these data is to see how rainfall
is related to the explanatory variables and, in particular, to
determine the effectiveness of seeding. The method to be used
is \stress{multiple linear regression}.
\section{Simple Linear Regression}
\section{Multiple Linear Regression \label{MLR-MLR}}
\subsection{Regression Diagnostics}
\section{Analysis Using \R{}}
\subsection{Estimating the Age of the Universe}
Prior to applying a simple regression to the data it will
be useful to look at a plot to assess their major features.
The \R{} code given in Figure~\ref{MLR-hubble-plot} produces
a scatterplot of velocity and distance.
\begin{figure}
\begin{center}
<>=
plot(velocity ~ distance, data = hubble)
@
\caption{Scatterplot of velocity and distance. \label{MLR-hubble-plot}}
\end{center}
\end{figure}
The diagram shows a clear, strong relationship between velocity
and distance. The next step is to fit a simple linear regression model
to the data, but in this case the nature of the data requires a model
without intercept because if distance is zero so is relative speed.
So the model to be fitted to these data is
\begin{eqnarray*}
\text{velocity} = \beta_1 \text{distance} + \varepsilon.
\end{eqnarray*}
This is essentially what astronomers call Hubble's Law and
$\beta_1$ is known as Hubble's constant; $\beta_1^{-1}$ gives
an approximate age of the universe.
To fit this model we are estimating $\beta_1$ using formula
(\ref{MLR:beta1}). Although this
operation is rather easy
<>=
sum(hubble$distance * hubble$velocity) /
sum(hubble$distance^2)
@
it is more convenient to apply \R's linear modeling function
<>=
hmod <- lm(velocity ~ distance - 1, data = hubble)
@
Note that the model formula specifies a model without intercept.
We can now extract the estimated model coefficients via
<>=
coef(hmod)
@
and add this estimated regression line to the scatterplot; the
result is shown in Figure~\ref{MLR-hubble-lmplot}. In addition,
we produce a scatterplot of the residuals $y_i - \hat{y}_i$ against
fitted values $\hat{y}_i$ to assess the quality of the model fit.
It seems that for higher distance values the variance of velocity
increases; however, we are interested in only the estimated
parameter $\hat{\beta}_1$ which remains valid under variance
heterogeneity (in contrast to $t$-tests and associated $p$-values).
Now we can use the estimated value of $\beta_1$ to find an approximate value
for the age of the universe. The Hubble constant itself has units of
$\text{km} \times \text{sec}^{-1} \times \text{Mpc}^{-1}$. A mega-parsec (Mpc)
is $3.09 \times 10^{19}$km, so we need to divide the estimated value of $\beta_1$
by this amount in order to obtain Hubble's constant with units of $\text{sec}^{-1}$.
The approximate age of the universe in seconds will then be the inverse of
this calculation. Carrying out the necessary computations
<>=
Mpc <- 3.09 * 10^19
ysec <- 60^2 * 24 * 365.25
Mpcyear <- Mpc / ysec
1 / (coef(hmod) / Mpcyear)
@
gives an estimated age of roughly $12.8$ billion years.
\begin{figure}
\begin{center}
<>=
layout(matrix(1:2, ncol = 2))
plot(velocity ~ distance, data = hubble)
abline(hmod)
plot(hmod, which = 1)
@
\caption{Scatterplot of velocity and distance with estimated
regression line (left) and plot of residuals against fitted values (right).
\label{MLR-hubble-lmplot}}
\end{center}
\end{figure}
\subsection{Cloud Seeding}
Again, a graphical display highlighting the most important
aspects of the data will be helpful.
Here we will construct boxplots of the rainfall in each category
of the dichotomous explanatory variables and scatterplots of
rainfall against each of the continuous explanatory variables.
\begin{figure}
\begin{center}
<>=
data("clouds", package = "HSAUR3")
layout(matrix(1:2, nrow = 2))
bxpseeding <- boxplot(rain ~ seeding, data = clouds,
ylab = "Rainfall", xlab = "Seeding")
bxpecho <- boxplot(rain ~ EM, data = clouds,
ylab = "Rainfall", xlab = "Echo Motion")
@
<>=
layout(matrix(1:2, nrow = 2))
bxpseeding <- boxplot(rain ~ seeding, data = clouds,
ylab = "Rainfall", xlab = "Seeding")
bxpecho <- boxplot(rain ~ EM, data = clouds,
ylab = "Rainfall", xlab = "Echo Motion")
@
\caption{Boxplots of \Robject{rain}. \label{MLR-rainfall-boxplot}}
\end{center}
\end{figure}
\begin{figure}
\begin{center}
<>=
layout(matrix(1:4, nrow = 2))
plot(rain ~ time, data = clouds)
plot(rain ~ cloudc, data = clouds)
plot(rain ~ sne, data = clouds, xlab="S-Ne criterion")
plot(rain ~ prewet, data = clouds)
@
\caption{Scatterplots of \Robject{rain} against the continuous
covariates. \label{MLR-rainfall-scplot}}
\end{center}
\end{figure}
Both the boxplots (Figure~\ref{MLR-rainfall-boxplot}) and the scatterplots
(Figure~\ref{MLR-rainfall-scplot}) show some evidence
of outliers. The row names of the extreme observations in the
\Robject{clouds} \Rclass{data.frame} can be identified via
<>=
rownames(clouds)[clouds$rain %in% c(bxpseeding$out,
bxpecho$out)]
@
where \Robject{bxpseeding} and \Robject{bxpecho} are variables
created by \Rcmd{boxplot} in Figure~\ref{MLR-rainfall-boxplot}.
Now we shall not remove these observations but bear
in mind during the modeling process that they may cause problems.
In this example it is sensible to assume that the effect of
some of the other explanatory variables is modified by seeding
and therefore consider a model that includes seeding as
covariate and, furthermore, allows interaction terms
\index{Interaction}
for \Robject{seeding} with each of the covariates except \Robject{time}.
This model can be described by the \Rclass{formula}
<>=
clouds_formula <- rain ~ seeding +
seeding:(sne + cloudc + prewet + EM) +
time
@
and the design matrix $\X^\star$ can be computed via
<>=
Xstar <- model.matrix(clouds_formula, data = clouds)
@
By default, treatment contrasts have been applied to the dummy codings of
the factors \Robject{seeding} and \Robject{EM} as can be seen from
the inspection of the \Robject{contrasts} attribute of the model matrix
<>=
attr(Xstar, "contrasts")
@
The default contrasts can be changed via the \Rarg{contrasts.arg}
argument to \Rcmd{model.matrix} or the \Robject{contrasts} argument to the
fitting function, for example \Rcmd{lm} or \Rcmd{aov} as shown in
\Sexpr{ch("ANOVA")}.
However, such internals are hidden and performed by high-level model-fitting
functions such as \Rcmd{lm} which will be used to fit the linear model
defined by the \Rclass{formula} \Robject{clouds\_formula}:
<>=
clouds_lm <- lm(clouds_formula, data = clouds)
class(clouds_lm)
@
The result of the model fitting is an object of class \Rclass{lm} for which
a \Rcmd{summary} method showing the conventional regression analysis
output is available. The output in Figure~\ref{MLR-clouds-summary}
shows the estimates $\hat{\beta}^\star$ with corresponding standard errors
and $t$-statistics as well as the $F$-statistic with associated $p$-value.
\renewcommand{\nextcaption}{\R{} output of the linear model fit
for the \Robject{clouds} data.
\label{MLR-clouds-summary}}
\SchunkLabel
<>=
summary(clouds_lm)
@
\SchunkRaw
Many methods are available for extracting components of the fitted model.
The estimates $\hat{\beta}^\star$ can be assessed via
\newpage
<>=
betastar <- coef(clouds_lm)
betastar
@
and the corresponding covariance matrix $\Cov(\hat{\beta}^\star)$ is available
from the \Rcmd{vcov} method
<>=
Vbetastar <- vcov(clouds_lm)
@
where the square roots of the diagonal elements are the standard errors as
shown in Figure~\ref{MLR-clouds-summary}
<>=
sqrt(diag(Vbetastar))
@
\begin{figure}
\begin{center}
<>=
psymb <- as.numeric(clouds$seeding)
plot(rain ~ sne, data = clouds, pch = psymb,
xlab = "S-Ne criterion")
abline(lm(rain ~ sne, data = clouds,
subset = seeding == "no"))
abline(lm(rain ~ sne, data = clouds,
subset = seeding == "yes"), lty = 2)
legend("topright", legend = c("No seeding", "Seeding"),
pch = 1:2, lty = 1:2, bty = "n")
@
\caption{Regression relationship between S-Ne criterion and rainfall with
and without seeding. \label{MLR-clouds-lmplot}}
\end{center}
\end{figure}
In order to investigate the quality of the model fit, we need access to the
residuals and the fitted values. The residuals can be found by the
\Rcmd{residuals} method and the fitted values of the response
from the \Rcmd{fitted} (or \Rcmd{predict}) method
<>=
clouds_resid <- residuals(clouds_lm)
clouds_fitted <- fitted(clouds_lm)
@
Now the residuals and the fitted values
can be used to construct diagnostic plots; for example the residual
plot in Figure~\ref{MLR-resid} where each observation is labelled by its
number (using \Rcmd{textplot} from package \Rpackage{wordclouds}).
Observations $1$ and $15$ give rather large residual values and the
data should perhaps be reanalysed after these two observations are removed.
The normal probability plot of the residuals shown in Figure~\ref{MLR-qqplot}
shows a reasonable agreement between theoretical and sample
quantiles, however, observations $1$ and $15$ are extreme again.
\begin{figure}
\begin{center}
<>=
plot(clouds_fitted, clouds_resid, xlab = "Fitted values",
ylab = "Residuals", type = "n",
ylim = max(abs(clouds_resid)) * c(-1, 1))
abline(h = 0, lty = 2)
textplot(clouds_fitted, clouds_resid,
words = rownames(clouds), new = FALSE)
@
\caption{Plot of residuals against fitted values for \Robject{clouds} seeding data.
\label{MLR-resid}}
\end{center}
\end{figure}
\begin{figure}
\begin{center}
<>=
qqnorm(clouds_resid, ylab = "Residuals")
qqline(clouds_resid)
@
\caption{Normal probability plot of residuals from cloud seeding model
\Robject{clouds\_lm}. \label{MLR-qqplot}}
\end{center}
\end{figure}
An index plot of the Cook's distances for each observation %'
(and many other plots including those constructed above from
using the basic functions) can be found from applying the \Rcmd{plot} method
to the object that results from the application
of the \Rcmd{lm} function.
\begin{figure}
\begin{center}
<>=
plot(clouds_lm)
@
<>=
plot(clouds_lm, which = 4, sub.caption = NULL)
@
\caption{Index plot of Cook's distances for cloud seeding data. %'
\label{MLR-cook}}
\end{center}
\end{figure}
Figure~\ref{MLR-cook} suggests that observations 2 and 18 have undue
influence on the
estimated regression coefficients, but the two outliers identified
previously do not. Again it may be useful to look at the results
after these two observations have been removed (see Exercise
6.2).
%% \ref{MLR-ex2})
\index{Regression diagnostics|)}
%%\bibliographystyle{LaTeXBibTeX/refstyle}
%%\bibliography{LaTeXBibTeX/HSAUR}
\end{document}