Comparing Two Groups
$$
\[ \small{ \begin{array}{r|rr|rr|r|rr|rrrrr} \text{call} & 1 & & 2 & & \dots & 625 & & & & & & \\ \text{question} & X_1 & Y_1 & X_2 & Y_2 & \dots & X_{625} & Y_{625} & \overline{X}_{625} & \overline{Y}_{625} &\frac{\sum_{i:X_i=0} Y_i}{\sum_{i:X_i=0} 1} & \frac{\sum_{i:X_i=1} Y_i}{\sum_{i:X_i=1} 1} & \text{difference} \\ \text{outcome} & \underset{\textcolor{gray}{x_{869369}}}{\textcolor[RGB]{248,118,109}{0}} & \underset{\textcolor{gray}{y_{869369}}}{\textcolor[RGB]{248,118,109}{1}} & \underset{\textcolor{gray}{x_{4428455}}}{\textcolor[RGB]{0,191,196}{1}} & \underset{\textcolor{gray}{y_{4428455}}}{\textcolor[RGB]{0,191,196}{1}} & \dots & \underset{\textcolor{gray}{x_{1268868}}}{\textcolor[RGB]{248,118,109}{0}} & \underset{\textcolor{gray}{y_{1268868}}}{\textcolor[RGB]{248,118,109}{1}} & 0.28 & 0.68 & \textcolor[RGB]{248,118,109}{0.68} & \textcolor[RGB]{0,191,196}{0.69} & 0.01 \\ \end{array} } \]
| | income | education | county |
---|---|---|---|
1 | $55k | 13 | orange |
2 | $25k | 13 | LA |
3 | $44k | 16 | san joaquin |
4 | $22k | 14 | orange |
⋮ | |||
2271 | $150k | 16 | stanislaus |
| | income | education | county |
---|---|---|---|
1 | $22k | 18 | unknown |
2 | $0k | 16 | solano |
3 | $98k | 16 | LA |
⋮ | |||
5677500 | $116k | 18 | unknown |
| | income | education | county |
---|---|---|---|
1 | $55k | 13 | orange |
2 | $25k | 13 | LA |
3 | $44k | 16 | san joaquin |
⋮ | |||
2271 | $150k | 16 | stanislaus |
For illustration, I’ve made up a fake population that looks like a bigger version of the sample.
\[ \begin{aligned} \mu(1) - \mu(0) \qfor \mu(x) &= \frac{1}{m_x}\sum_{j:x_j = x } y_j \\ \qqtext{ where } \quad m_x &= \sum_{j:x_j=x} 1 \end{aligned} \]
\[ \begin{aligned} \hat\mu(1) - \hat\mu(0) \qfor \hat\mu(x) &= \frac{1}{N_x}\sum_{i:X_i=x} Y_i \\ \qqtext{ where } \quad N_x &= \sum_{i:X_i=x} 1. \end{aligned} \]
\[ \begin{aligned} \mathop{\mathrm{E}}[Y] &= \sum_{j=1}^m \underset{\text{response}}{y_j} \times \underset{\text{probability}}{\frac{1}{m}} = \frac{1}{m}\sum_{j=1}^m y_j = \mu \\ \mathop{\mathrm{E}}[(Y-\mathop{\mathrm{E}}[Y])^2] &= \sum_{j=1}^m \underset{\text{deviation}^2}{(y_j - \mu)^2} \times \underset{\text{probability}}{\frac{1}{m}} = \frac{1}{m}\sum_{j=1}^m (y_j - \mu)^2 = \sigma^2 \end{aligned} \]
\[ \begin{aligned} P(J_i=j) = \begin{cases} \frac{m_{\text{green}}}{m} \ \times \frac{1}{m_{\text{green}}} \ = \ \frac{1}{m} & \text{if the $j$th dot is green ($x_j=1$) } \\ \frac{m_{\text{red}}}{m} \ \times \frac{1}{m_{\text{red}}} \ = \ \frac{1}{m} & \text{otherwise} \end{cases} \end{aligned} \]
\[ \mathop{\mathrm{\mathop{\mathrm{V}}}}[Y \mid X] = \mathop{\mathrm{E}}[\textcolor[RGB]{0,0,255}{\{Y-\mu(X)\}}^2 \mid X] \qfor \mu(X) = \mathop{\mathrm{E}}[Y \mid X] \]
\[ \begin{aligned} E ( a Y + b Z ) &= E (aY) + E (bZ) = aE(Y) + bE(Z) \\ &\text{ for random variables $Y, Z$ and numbers $a,b$ } \end{aligned} \]
Application. When we sample uniformly-at-random, the sample mean is unbiased.
\[ \begin{aligned} \mathop{\mathrm{E}}[\hat\mu] &= \mathop{\mathrm{E}}\qty[\frac1n\sum_{i=1}^n Y_i] \\ &= \frac1n\sum_{i=1}^n \mathop{\mathrm{E}}[Y_i] \\ &= \frac1n\sum_{i=1}^n \mu \\ &= \frac1n \times n \times \mu = \mu. \end{aligned} \]
Conditional Version. \[ \begin{aligned} E\{ a(X) Y + b(X) Z \mid X \} &= E\{a(X)Y \mid X\} + E\{ b(X)Z \mid X\} \\ &= a(X)E(Y \mid X) + b(X)E(Z \mid X) \\ & \text{ for random variables $X, Y, Z$ and functions $a,b$ } \end{aligned} \]
\[ \color{gray} \mathop{\mathrm{E}}[\textcolor[RGB]{239,71,111}{Y}\textcolor[RGB]{17,138,178}{Z}] = \textcolor[RGB]{239,71,111}{\mathop{\mathrm{E}}[Y]}\textcolor[RGB]{17,138,178}{\mathop{\mathrm{E}}[Z]} \qqtext{when $\textcolor[RGB]{239,71,111}{Y}$ and $\textcolor[RGB]{17,138,178}{Z}$ are independent} \]
Application. When we sample with replacement, the sample mean’s variance is the population variance divided by \(n\).
\[ \color{gray} \begin{aligned} \mathop{\mathrm{\mathop{\mathrm{V}}}}[\hat\mu] &= \mathop{\mathrm{E}}\qty[ \qty{ \frac{1}{n}\sum_{i=1}^n Y_i - \mathop{\mathrm{E}}\qty[ \frac{1}{n}\sum_{i=1}^n Y_i ] }^2 ] \\ &= \mathop{\mathrm{E}}\qty[ \qty{ \frac{1}{n}\sum_{i=1}^n (Y_i - \mathop{\mathrm{E}}[Y_i]) }^2 ] \\ &= \mathop{\mathrm{E}}\qty[ \qty{ \frac{1}{n}\sum_{i=1}^n Z_i }^2 ] && \text{ for } \ Z_i = Y_i - \mathop{\mathrm{E}}[Y_i] \\ &= \mathop{\mathrm{E}}\qty[ \qty{ \textcolor[RGB]{239,71,111}{\frac{1}{n}\sum_{i=1}^n Z_i }} \times \qty{\textcolor[RGB]{17,138,178}{\frac{1}{n}\sum_{j=1}^n Z_j}} ] &&\text{ with } \mathop{\mathrm{E}}[ Z_i ] = \mathop{\mathrm{E}}[ Y_i ] - \mathop{\mathrm{E}}[Y_i] = \mu - \mu = 0 \\ &= \mathop{\mathrm{E}}\qty[ \frac{1}{n^2}\textcolor[RGB]{239,71,111}{\sum_{i=1}^n} \textcolor[RGB]{17,138,178}{\sum_{j=1}^n} \textcolor[RGB]{239,71,111}{Z_i} \textcolor[RGB]{17,138,178}{Z_j} ] &&\text{ and } \ \mathop{\mathrm{E}}[ Z_i^2] = \mathop{\mathrm{E}}[ \{ Y_i - \mathop{\mathrm{E}}[Y_i] \}^2 ] = \mathop{\mathrm{\mathop{\mathrm{V}}}}[Y_i] = \sigma^2 \\ &= \frac{1}{n^2} \textcolor[RGB]{239,71,111}{\sum_{i=1}^n} \textcolor[RGB]{17,138,178}{\sum_{j=1}^n} \mathop{\mathrm{E}}\qty[\textcolor[RGB]{239,71,111}{Z_i} \textcolor[RGB]{17,138,178}{Z_j} ] \\ &= \frac{1}{n^2} \textcolor[RGB]{239,71,111}{\sum_{i=1}^n} \textcolor[RGB]{17,138,178}{\sum_{j=1}^n} \begin{cases} \mathop{\mathrm{E}}[Z_i^2]=\sigma^2 & \text{if } j=i \\ \textcolor[RGB]{239,71,111}{\mathop{\mathrm{E}}[Z_i]}\textcolor[RGB]{17,138,178}{\mathop{\mathrm{E}}[Z_j]} = 0 \times 0 & \text{if } j\neq i \end{cases} \\ &= \frac{1}{n^2} \textcolor[RGB]{239,71,111}{\sum_{i=1}^n} \sigma^2 = \frac{1}{n^2} \times n \times \sigma^2 = \frac{\sigma^2}{n} \end{aligned} \]
\[ \mathop{\mathrm{E}}[Y] = \mathop{\mathrm{E}}\qty[ \mathop{\mathrm{E}}( Y \mid X ) ] \quad \text{ for any random variables $X, Y$} \]
\(p\) | \(X_i\) | \(\mathop{\mathrm{E}}[Y_i \mid X_i]\) |
---|---|---|
\(\frac{3}{6}\) | \(0\) | \(1\) |
\(\frac{3}{6}\) | \(1\) | \(1.25\) |
What is \(\mathop{\mathrm{E}}[Y_i]\)?
\(p\) | \(X_i\) | \(\mathop{\mathrm{E}}[Y_i \mid X_i]\) |
---|---|---|
\(0.58\) | 0 | \(31.22K\) |
\(0.42\) | 1 | \(72.35K\) |
What is \(\mathop{\mathrm{E}}[Y_i]\)?
\[ \mathop{\mathrm{E}}\qty[Y \mid X] = \mathop{\mathrm{E}}\qty[ \ \mathop{\mathrm{E}}[Y \mid W, X] \ \mid X] \quad \text{ for any random variables $W, X, Y$} \]
\(p\) | \(W\) | \(X\) | \(\mathop{\mathrm{E}}[Y \mid W, X]\) |
---|---|---|---|
\(\frac{3}{12}\) | ● | 0 | \(1\) |
\(\frac{3}{12}\) | ▴ | 0 | \(1.75\) |
\(\frac{3}{12}\) | ● | 1 | \(1.25\) |
\(\frac{3}{12}\) | ▴ | 1 | \(2\) |
The function is the gray ● and ▴ in the plot.
\(p\) | \(X\) | \(\mathop{\mathrm{E}}[Y \mid X]\) |
---|---|---|
\(\frac{6}{12}\) | 0 | \(1.375\) |
\(\frac{6}{12}\) | 1 | \(1.625\) |
The function is the gray ◆ in the plot.
1.5
That’s midway
between the
two diamonds.
\[ \mathop{\mathrm{E}}[ \textcolor[RGB]{17,138,178}{Y} \mid \textcolor[RGB]{17,138,178}{X}, \textcolor[RGB]{239,71,111}{X'} ] = \textcolor[RGB]{17,138,178}{\mathop{\mathrm{E}}[ Y \mid X ]} \quad \text{ when $X'$ is independent of $X,Y$ } \]
Application. When we sample with replacement, the conditional expectation of \(Y_i\) given \(X_1 \ldots X_n\) is the conditional mean of \(Y_i\) given \(X_i\) alone.
\[ \mathop{\mathrm{E}}[Y_i \mid X_1 \ldots X_n] = \mathop{\mathrm{E}}[Y_i \mid X_i] = \mu(X_i) \qqtext{ when $(X_1, Y_1) \ldots (X_n, Y_n)$ are independent.} \]
\[ 1_{=x}(X)\mu(X) = \begin{cases} 1 \times \mu(X) & \text{ when } X=x \\ 0 \times \mu(X) & \text{ when } X \neq x \end{cases} = \begin{cases} 1 \times \mu(x) & \text{ when } X=x \\ 0 \times \mu(x) & \text{ when } X \neq x \end{cases} = 1_{=x}(X)\mu(x) \]
Application.
\[ \mathop{\mathrm{E}}[1_{=x}(X)\mu(X)] = \mathop{\mathrm{E}}[ 1_{=x}(X) \mu(x) ] = \mu(x) \mathop{\mathrm{E}}[1_{=x}(X)] = \mu(x) \times P(X=x) \]
Claim. When we sample uniformly-at-random, the sample mean is an unbiased estimator of the population mean. \[ \mathop{\mathrm{E}}[\hat\mu] = \mu \]
We proved this earlier today using linearity of expectations. See Slide 3.1.
Claim. When we sample with replacement, a subsample mean is an unbiased estimator of the corresponding subpopulation mean. \[ \mathop{\mathrm{E}}[\hat\mu(1)] = \mu(1) \]
\[ \hat\mu(1) = \frac{\sum_{i:X_i=1} Y_i}{\sum_{i:X_i=1} 1} = \frac{\sum_{i=1}^{n} 1_{=1}(X_i) Y_{i}}{\sum_{i=1}^{n} 1_{=1}(X_i)} \]
\[ \begin{aligned} \mathop{\mathrm{E}}[\hat\mu(1) \mid X_1 \ldots X_n] &=\mathop{\mathrm{E}}\qty[\frac{\sum_{i=1}^{n} 1_{=1}(X_i) Y_{i}}{\sum_{i=1}^{n} 1_{=1}(X_i)} \mid X_1 \ldots X_n] \\ &\overset{\texttip{\small{\unicode{x2753}}}{via linearity. All the indicators (and their sum) are functions of $X_1\ldots X_n$.}}{=} \frac{\sum_{i=1}^{n} 1_{=1}(X_i) \mathop{\mathrm{E}}\qty{ Y_{i} \mid X_1 \ldots X_n}}{\sum_{i=1}^{n}1_{=1}(X_i)} \\ &\overset{\texttip{\small{\unicode{x2753}}}{via irrelevance of independent conditioning variables. $(X_i,Y_i)$ are independent of the other $X$s.}}{=} \frac{\sum_{i=1}^{n} 1_{=1}(X_i) \mathop{\mathrm{E}}\qty{ Y_{i} \mid X_i}}{\sum_{i=1}^{n}1_{=1}(X_i)} \\ &\overset{\texttip{\small{\unicode{x2753}}}{This is just a change of notation, as $\mu(X_i)=\mathop{\mathrm{E}}[Y_i \mid X_i]$, but it makes it easier to understand what happens next.}}{=} \frac{\sum_{i=1}^{n} 1_{=1}(X_i) \mu(X_i)}{\sum_{i=1}^{n}1_{=1}(X_i)} \\ &\overset{\texttip{\small{\unicode{x2753}}}{via the indicator trick.}}{=}\frac{\sum_{i=1}^{n}1_{=1}(X_i) \mu(1)}{\sum_{i=1}^{n}1_{=1}(X_{i})} \\ &\overset{\texttip{\small{\unicode{x2753}}}{via linearity}}{=} \mu(1) \ \frac{\sum_{i=1}^{n} 1_{=1}(X_i) }{\sum_{i=1}^{n}1_{=1}(X_{i})} = \mu(1) \end{aligned} \]
and therefore, using the law of iterated expectations,
\[ \mathop{\mathrm{E}}[\hat\mu(1)] = \mathop{\mathrm{E}}\qty[\mathop{\mathrm{E}}\qty[\hat\mu(1) \mid X_1 \ldots X_n]] = \mathop{\mathrm{E}}[\mu(1)] = \mu(1) \]
Claim. A difference in subsample means is unbiased for the corresponding difference in subpopulation means.
\[ \mathop{\mathrm{E}}[\hat\mu(1) - \hat\mu(0)] = \mu(1) - \mu(0) \]
Why?
This follows from the linearity of expectations and unbiasedness of the subsample means.
\[ \mathop{\mathrm{E}}[\hat\mu(1) - \hat\mu(0)] = \mathop{\mathrm{E}}[\hat\mu(1)] - \mathop{\mathrm{E}}[\hat\mu(0)] = \mu(1) - \mu(0) \]
Is it conditionally unbiased?
It is. That follows from linearity of conditional expectations and conditional unbiasedness of the subsample means.
\[ \mathop{\mathrm{E}}[\hat\mu(1) - \hat\mu(0) \mid X_1 \ldots X_n] = \mathop{\mathrm{E}}[\hat\mu(1) \mid X_1 \ldots X_n] - \mathop{\mathrm{E}}[\hat\mu(0) \mid X_1 \ldots X_n] = \mu(1) - \mu(0) \]
\[ \mathop{\mathrm{\mathop{\mathrm{V}}}}[Y] = \mathop{\mathrm{E}}\qty{\mathop{\mathrm{\mathop{\mathrm{V}}}}( Y \mid X ) } + \mathop{\mathrm{\mathop{\mathrm{V}}}}\qty{\mathop{\mathrm{E}}( Y \mid X ) } \]
We won’t prove this, but it’s in the slides if you’re interested. See Slide 8.
Claim. When we sample with replacement, the variance of the sample mean
is the population variance divided by the number of people in the sample.
\[ \mathop{\mathrm{\mathop{\mathrm{V}}}}[\hat\mu] = \frac{\sigma^2}{n} \]
When we sample with replacement, the conditional expectation of products factors into a product of conditional expectations.
\[ \mathop{\mathrm{E}}[Y_i \ Y_j \mid X_1 \ldots X_n] = \mathop{\mathrm{E}}[Y_i \mid X_i]\mathop{\mathrm{E}}[Y_j \mid X_j] = \mu(X_i)\mu(X_j) \qqtext{ when $(X_1, Y_1) \ldots (X_n, Y_n)$ are independent and $i \neq j$.} \]
This is a subtle application of two of our ‘laws’. \[ \begin{aligned} &\mathop{\mathrm{E}}[Y \mid X] = \mathop{\mathrm{E}}[ \mathop{\mathrm{E}}\qty[Y \mid W, X] \mid X] \quad \text{ for any random variables $W, X, Y$} && \text{Law of Iterated Iterated Expectations} \\ &\mathop{\mathrm{E}}[ \textcolor[RGB]{17,138,178}{Y} \mid \textcolor[RGB]{17,138,178}{X}, \textcolor[RGB]{239,71,111}{X'} ] = \textcolor[RGB]{17,138,178}{\mathop{\mathrm{E}}[ Y \mid X ]} \quad \text{ when $X'$ is independent of $X,Y$ } && \text{Irrelevance of Independent Conditioning Variables} \end{aligned} \]
\[ \color{gray} \begin{aligned} \mathop{\mathrm{E}}[ \textcolor[RGB]{239,71,111}{Y_i} \ \textcolor[RGB]{17,138,178}{Y_j} \mid X_1 \ldots X_n] &\overset{\texttip{\small{\unicode{x2753}}}{Here we use the law of iterated iterated expectations to 'break the second stage into a second and third'}}{=} \mathop{\mathrm{E}}[\mathop{\mathrm{E}}[\textcolor[RGB]{239,71,111}{Y_i} \ \textcolor[RGB]{17,138,178}{Y_j} \mid \textcolor[RGB]{239,71,111}{Y_i}, X_1 \ldots X_n] \mid X_1 \ldots X_n] \\ &\overset{\texttip{\small{\unicode{x2753}}}{Here we pull $Y_i$ out of the inner expectation (stage 3). That's justified by linearity of conditional expectations because it's a function of $\textcolor[RGB]{239,71,111}{Y_i}, X_1 \ldots X_n$, the stuff we chose in stages 1 and 2.}}{=} \mathop{\mathrm{E}}[ \textcolor[RGB]{239,71,111}{Y_i} \ \mathop{\mathrm{E}}[\textcolor[RGB]{17,138,178}{Y_j} \mid \textcolor[RGB]{239,71,111}{Y_i}, X_1 \ldots X_n] \mid X_1 \ldots X_n] \\ &\overset{\texttip{\small{\unicode{x2753}}}{Here we drop the irrelevant conditioning variables from the inner expectation. In the notation above, $Y=Y_j$, $X=X_j$, and $X'=(Y_i, X_1 \ldots X_n \text{ except } X_j)$, which describes other 'calls', is independent of $(X,Y)=(X_j, Y_j)$.}}{=} \mathop{\mathrm{E}}[\textcolor[RGB]{239,71,111}{Y_i} \ \mathop{\mathrm{E}}[\textcolor[RGB]{17,138,178}{Y_j} \mid X_j] \mid X_1 \ldots X_n] \\ &\overset{\texttip{\small{\unicode{x2753}}}{Here we pull $\textcolor[RGB]{17,138,178}{\mathop{\mathrm{E}}[Y_j \mid X_j]}$ out of the outer expectation (stage 2). That's justified by linearity of conditional expectations because it's a function of $X_1 \ldots X_n$, the stuff we chose in stage 1.}}{=} \textcolor[RGB]{17,138,178}{\mathop{\mathrm{E}}[Y_j \mid X_j]} \ \mathop{\mathrm{E}}[\textcolor[RGB]{239,71,111}{Y_i} \mid X_1 \ldots X_n] \\ &\overset{\texttip{\small{\unicode{x2753}}}{Here we drop the irrelevant conditioning variables from the expectation involving $\textcolor[RGB]{239,71,111}{Y_i}$. 
In the notation above, $Y=Y_i$, $X=X_i$, and $X'=(X_1 \ldots X_n \text{ except } X_i)$, which describes other 'calls', is independent of $(X,Y)=(X_i, Y_i)$.}}{=} \textcolor[RGB]{17,138,178}{\mathop{\mathrm{E}}[Y_j \mid X_j]} \ \textcolor[RGB]{239,71,111}{\mathop{\mathrm{E}}[Y_i \mid X_i]} \\ &\overset{\texttip{\small{\unicode{x2753}}}{This is the same thing in different notation.}}{=} \textcolor[RGB]{17,138,178}{\mu(X_j)} \ \textcolor[RGB]{239,71,111}{\mu(X_i)} \end{aligned} \]
Claim. When we sample with replacement, the variance of a subsample mean is the
expected value of the subpopulation variance divided by the number of people in the subsample. \[
\mathop{\mathrm{\mathop{\mathrm{V}}}}[\hat\mu(1)] = \mathop{\mathrm{E}}\qty[ \frac{\sigma^2(1)}{N_1} ] \quad \text{ for } \quad N_1 = \sum_{i=1}^n 1_{=1}(X_i)
\]
\[ \begin{aligned} \mathop{\mathrm{\mathop{\mathrm{V}}}}\qty[ \hat \mu(1) \mid X_1 \ldots X_n ] &\overset{\texttip{\small{\unicode{x2753}}}{Definitionally.}}{=} \mathop{\mathrm{E}}\qty[ \qty{\hat \mu(1) - \mu(1) }^2 \mid X_1 \ldots X_n ] \\ &\overset{\texttip{\small{\unicode{x2753}}}{Also definitionally.}}{=} \mathop{\mathrm{E}}\qty[ \qty{\frac{\sum_{i=1}^n 1_{=1}(X_i)Y_{i}}{\sum_{i=1}^n 1_{=1}(X_i)} - \mu(1)}^2 \mid X_1 \ldots X_n ] \\ &\overset{\texttip{\small{\unicode{x2753}}}{Multiplying $\mu(1)$ by $N_1/N_1$ to get a common denominator}}{=} \mathop{\mathrm{E}}\qty[ \qty{ \frac{\sum_{i=1}^n 1_{=1}(X_i){\textcolor[RGB]{0,0,255}{\qty{Y_{i} - \mu(1)}}}}{\sum_{i=1}^n 1_{=1}(X_i)}}^2 \mid X_1 \ldots X_n ] \\ &\overset{\texttip{\small{\unicode{x2753}}}{The indicator trick tells us the factors we've highlighted in \textcolor[RGB]{0,0,255}{blue} are the same.}}{=} \mathop{\mathrm{E}}\qty[ \qty{ \frac{\sum_{i=1}^n 1_{=1}(X_i)\textcolor[RGB]{0,0,255}{Z_i}}{\sum_{i=1}^n 1_{=1}(X_i)}}^2 \mid X_1 \ldots X_n ] \qfor \textcolor[RGB]{0,0,255}{Z_i = Y_{i} - \mu(X_i)} \\ &\overset{\texttip{\small{\unicode{x2753}}}{Expanding the square.}}{=} \mathop{\mathrm{E}}\qty[ \frac{\textcolor[RGB]{239,71,111}{\sum_{i=1}^n}\textcolor[RGB]{17,138,178}{\sum_{j=1}^n} \textcolor[RGB]{239,71,111}{1_{=1}(X_i)Z_i} \ \textcolor[RGB]{17,138,178}{1_{=1}(X_j)Z_j}}{\qty{\sum_{i=1}^n 1_{=1}(X_i) }^2} \mid X_1 \ldots X_n ] \\ &\overset{\texttip{\small{\unicode{x2753}}}{Distributing the expectation. We can pull out the denominator and the indicator in each term because they're functions of $X_1 \ldots X_n$.}}{=} \frac{\textcolor[RGB]{239,71,111}{\sum_{i=1}^n}\textcolor[RGB]{17,138,178}{\sum_{j=1}^n} \textcolor[RGB]{239,71,111}{1_{=1}(X_i)} \textcolor[RGB]{17,138,178}{1_{=1}(X_j)}\mathop{\mathrm{E}}\qty[\textcolor[RGB]{239,71,111}{Z_i} \textcolor[RGB]{17,138,178}{Z_j} \mid X_1 \ldots X_n] }{\qty{\sum_{i=1}^n 1_{=1}(X_i) }^2} \end{aligned} \]
This random variable \(Z_i\) is like the one we have in the unconditional case, but conditionally.
\[ \begin{aligned} \mathop{\mathrm{E}}[ Z_i \mid X_i ] &= \mathop{\mathrm{E}}[ Y_i - \mu(X_i) \mid X_i ] = \mathop{\mathrm{E}}[ Y_i \mid X_i ] - \mu(X_i) = \mu(X_i) - \mu(X_i) = 0 && \overset{\texttip{\small{\unicode{x2753}}}{It has conditional expectation zero. }}{\ } \\ \mathop{\mathrm{E}}[ Z_i^2 \mid X_i] &= \mathop{\mathrm{E}}[ \{ Y_i - \mathop{\mathrm{E}}[Y_i \mid X_i] \}^2 \mid X_i] = \mathop{\mathrm{\mathop{\mathrm{V}}}}[Y_i \mid X_i] && \overset{\texttip{\small{\unicode{x2753}}}{The conditional expectation of its square is the conditional variance of $Y_i$ }}{\ } \end{aligned} \]
\[ \begin{aligned} \mathop{\mathrm{E}}\qty[\textcolor[RGB]{239,71,111}{Z_i}\textcolor[RGB]{17,138,178}{Z_j} \mid X_1 \ldots X_n] &\overset{\texttip{\small{\unicode{x2753}}}{Because $i=j$}}{=} \mathop{\mathrm{E}}\qty[ Z_i^2 \mid X_1 \ldots X_n] \\ &\overset{\texttip{\small{\unicode{x2753}}}{Because $X' = (X_1 \ldots X_n \text{ except } X_i)$ is independent of $X_i$ and $Y_i$.}}{=} \mathop{\mathrm{E}}\qty[ Z_i^2 \mid X_i] \\ &\overset{\texttip{\small{\unicode{x2753}}}{Definitionally}}{=}\mathop{\mathrm{\mathop{\mathrm{V}}}}[Y_i \mid X_i] = \sigma^2(X_i) \end{aligned} \]
\[ \begin{aligned} \mathop{\mathrm{E}}\qty[\textcolor[RGB]{239,71,111}{Z_i} \textcolor[RGB]{17,138,178}{Z_j} \mid X_1 \ldots X_n] &\overset{\texttip{\small{\unicode{x2753}}}{This is the factorization identity we proved a few slides back.}}{=}\textcolor[RGB]{239,71,111}{\mathop{\mathrm{E}}\qty[Z_i \mid X_i]} \textcolor[RGB]{17,138,178}{\mathop{\mathrm{E}}\qty[Z_j \mid X_j]} \\ &\overset{\texttip{\small{\unicode{x2753}}}{Definitionally}}{=} 0 \times 0 \end{aligned} \]
\[ \begin{aligned} \mathop{\mathrm{\mathop{\mathrm{V}}}}\qty[ \hat \mu(1) ] &\overset{\texttip{\small{\unicode{x2753}}}{Step 1}}{=} \underset{\color[RGB]{64,64,64}\text{within-groups term}}{\mathop{\mathrm{E}}\qty[ \mathop{\mathrm{\mathop{\mathrm{V}}}}\qty{ \hat \mu(1) \mid X_1 \ldots X_n }] } + \underset{\color[RGB]{64,64,64}\text{between-groups term}}{ 0 } \\ &\overset{\texttip{\small{\unicode{x2753}}}{Step 2}}{=} \mathop{\mathrm{E}}\qty[\frac{\textcolor[RGB]{239,71,111}{\sum_{i=1}^n}\textcolor[RGB]{17,138,178}{\sum_{j=1}^n} \textcolor[RGB]{239,71,111}{1_{=1}(X_i)} \textcolor[RGB]{17,138,178}{1_{=1}(X_j)}\mathop{\mathrm{E}}\qty[\textcolor[RGB]{239,71,111}{Z_i} \textcolor[RGB]{17,138,178}{Z_j} \mid X_1 \ldots X_n] }{\qty{\sum_{i=1}^n 1_{=1}(X_i) }^2}] + 0 \\ &\overset{\texttip{\small{\unicode{x2753}}}{Step 3}}{=} \mathop{\mathrm{E}}\qty[\frac{\textcolor[RGB]{239,71,111}{\sum_{i=1}^n 1_{=1}(X_i)} \ \sigma^2(X_i) }{\qty{\sum_{i=1}^n 1_{=1}(X_i) }^2}] \\ &\overset{\texttip{\small{\unicode{x2753}}}{The indicator trick again.}}{=} \mathop{\mathrm{E}}\qty[\frac{\textcolor[RGB]{239,71,111}{\sum_{i=1}^n 1_{=1}(X_i)} \ \sigma^2(1) }{\qty{\sum_{i=1}^n 1_{=1}(X_i) }^2}] \\ &\overset{\texttip{\small{\unicode{x2753}}}{Cancelling a common factor of $N_1$ in the numerator and denominator.}}{=} \mathop{\mathrm{E}}\qty[\frac{\sigma^2(1)}{\sum_{i=1}^n 1_{=1}(X_i) } ] \end{aligned} \]
Claim. The variance of the difference in subsample means is the sum of the variances of the subsample means. \[ \mathop{\mathrm{\mathop{\mathrm{V}}}}[\hat\mu(1) - \hat\mu(0)] = \mathop{\mathrm{E}}\qty[ \frac{\sigma^2(1)}{N_1} + \frac{\sigma^2(0)}{N_0}] \quad \text{ for } \quad N_x = \sum_{i=1}^n 1_{=x}(X_i) \]
This’ll be a Homework Exercise.
\[ \mathop{\mathrm{\mathop{\mathrm{V}}}}[\hat\mu(1) - \hat\mu(0)] = \mathop{\mathrm{E}}\qty[ \frac{\sigma^2(1)}{N_1} + \frac{\sigma^2(0)}{N_0}] \quad \text{ for } \quad N_x = \sum_{i=1}^n 1_{=x}(X_i) \]
\[ \widehat{\mathop{\mathrm{\mathop{\mathrm{V}}}}}[\hat\mu(1) - \hat\mu(0)] = \frac{\hat{\sigma}^2(1)}{N_1} + \frac{\hat{\sigma}^2(0)}{N_0} \qfor \hat \sigma^2(x) = \frac{1}{N_x} \sum_{i:X_i=x} (Y_i - \hat \mu(x))^2 \]
The NSW Experiment
\(x\) | \(N_x\) | \(\hat \mu(x)\) | \(\hat \sigma(x)\) |
---|---|---|---|
0 | 260 | 4600 | 5500 |
1 | 185 | 6300 | 7900 |
\[ \begin{aligned} \mu(1)-\mu(0) &\in 1800 \pm 1300 \approx [500, \ 3100] &&\qqtext{is a 95\% confidence interval in 1978 dollars} \\ &\in 8600 \pm 6300 \approx [2300, \ 14900] &&\qqtext{adjusted for inflation} \end{aligned} \]
\(x\) | \(N_x\) | \(\hat \mu(x)\) | \(\hat \sigma(x)\) |
---|---|---|---|
0 | 260 | 4600 | 5500 |
1 | 185 | 6300 | 7900 |
Warm-Up.
Our interval based on normal approximation should be \(\hat\theta \pm 1.96 \times \hat\sigma_{\hat\theta}\) where \(\hat\theta=\hat\mu(1) - \hat\mu(0)\) is our point estimate and \(\hat\sigma_{\hat\theta}\) is an estimate of its standard deviation. That’s roughly \(\hat\theta \pm 2 \times 670\). Pretty similar to our bootstrap interval. Using the formula from Slide 5.13, we calculate it like this.
\[ \begin{aligned} \widehat{\mathop{\mathrm{\mathop{\mathrm{V}}}}}[\hat\mu(1) - \hat\mu(0)] &= \frac{\hat\sigma^2(1)}{N_1} + \frac{\hat\sigma^2(0)}{N_0} \\ &\approx \frac{61.90M}{185} + \frac{30.07M}{260} \\ &\approx 450.24K \approx \hat\sigma_{\hat\theta}^2 \qfor \hat\sigma_{\hat\theta} \approx 670 \end{aligned} \]
Resizing Part 1.
If we’re resizing our experiment but running as before, we can think of the new subsample sizes as being roughly proportional to the old ones. That is, if our new sample size is \(\alpha n\), then our subsample sizes will be \(\alpha N_1\) and \(\alpha N_0\). Plugging these new sample sizes in our variance formula, we get a simple formula for our estimator’s standard deviation in this new experiment relative to the old one. It’s just \(\sigma_{\hat\theta}/\sqrt{\alpha}\) where \(\sigma_{\hat\theta}\) is the standard deviation of our estimator in our original experiment.
\[ \begin{aligned} \mathop{\mathrm{\mathop{\mathrm{V}}}}[\hat\mu(1) - \hat\mu(0)] &= \frac{\sigma^2(1)}{\alpha N_1} + \frac{\sigma^2(0)}{\alpha N_0} = \frac{1}{\alpha} \sigma_{\hat\theta}^2 = \qty(\frac{\sigma_{\hat\theta}}{\sqrt{\alpha}})^2 \end{aligned} \]
That is, it’s \(1/\sqrt{\alpha}\) times the quantity we just estimated to be \(\hat\sigma_{\hat\theta} \approx 670\), so it’s sensible to estimate it by \(\hat\sigma_{\hat\theta}/\sqrt{\alpha}\). We want our new interval \(\hat\theta \pm 1.96\hat\sigma_{\hat\theta}/\sqrt{\alpha}\) to be \(\hat\theta \pm 500\). The old one, \(\hat\theta \pm 1.96\hat\sigma_{\hat\theta} \approx \hat\theta \pm 1320\), was about 3 times wider than this, so we want \(\sqrt{\alpha} \approx 3\) and therefore \(\alpha \approx 9\). Since we’ve already got a sample of size \(n\), we can run a ‘second wave’ of size \((\alpha-1)n \approx 8n\) to get a big enough sample.
That’s a back-of-the-envelope calculation. If we want to be a bit more precise, we can actually solve for \(\alpha\) that equates these two interval widths. \[ \begin{aligned} \frac{1.96\hat\sigma_{\hat\theta}}{\sqrt{\alpha}} = 500 \qfor \alpha = \qty(\frac{1.96\hat\sigma_{\hat\theta}}{500})^2 \approx 6.92 \end{aligned} \]
That’s a bit better than 9. It pays to be precise sometimes.
Resizing Part 2. If it’s free to include more participants in our control group, we may as well make \(N_0\) arbitrarily large. If we did, we’d have this variance.
\[ \begin{aligned} \widehat{\mathop{\mathrm{\mathop{\mathrm{V}}}}}[\hat\mu(1) - \hat\mu(0)] &= \frac{\hat\sigma^2(1)}{N_1} + \frac{\hat\sigma^2(0)}{\infty} = \qty(\frac{\hat\sigma(1)}{\sqrt{N_1}})^2 \approx 580^2 \end{aligned} \]
The resulting interval would be \(\pm 1.96 \times 580 \approx \pm 1130\). Roughly twice as wide as we want. So we’re not getting the precision we want for free. Thinking back to Part 1, that means we’d want \(N_1\) to be roughly \(\alpha = 4\) times larger. And that’d mean treating roughly \((\alpha-1) = 3\) times as many additional people as we treated in the first wave. That’s a lot cheaper than treating \(8\) (or really \(5.92\)) times as many people, as we would if we ran our second wave as a scaled-up version of the first.
\[ \small{ \begin{aligned} \mathop{\mathrm{\mathop{\mathrm{V}}}}(Y) &= \mathop{\mathrm{E}}\qty[ \qty{ Y - \mathop{\mathrm{E}}(Y) }^2 ] && \text{ the \emph{spread} idea: the mean square of a \emph{centered version} of $Y$ } \\ &= \mathop{\mathrm{E}}\qty(Y^2) - \qty{\mathop{\mathrm{E}}\qty(Y)}^2 && \text{ the \emph{excess} idea: the average amount $Y^2$ \emph{exceeds} the square of its mean}. \end{aligned} } \]
Let’s think, in terms of the plot, about why there is excess in the square.
Claim. The spread and excess formulas for variance are equivalent. \[ \mathop{\mathrm{E}}\qty[ \qty{ Y - \mathop{\mathrm{E}}(Y) }^2 ] =\mathop{\mathrm{E}}[ Y^2] - \qty{\mathop{\mathrm{E}}(Y)}^2 \]
\[ \small{ \begin{aligned} \mathop{\mathrm{E}}\qty[ \qty{ Y - \mathop{\mathrm{E}}(Y) }^2 ] &= \mathop{\mathrm{E}}\qty[ Y^2 - 2 Y \mathop{\mathrm{E}}(Y) + \qty{\mathop{\mathrm{E}}(Y)}^2 ] && \text{ FOIL } \\ &= \mathop{\mathrm{E}}[ Y^2] - \mathop{\mathrm{E}}\qty[2 Y \mathop{\mathrm{E}}(Y)] + \mathop{\mathrm{E}}\qty[\qty{\mathop{\mathrm{E}}(Y)}^2] && \text{Distributing expectations (linearity)} \\ &= \mathop{\mathrm{E}}[ Y^2] - 2\mathop{\mathrm{E}}(Y)\mathop{\mathrm{E}}\qty[Y] + \qty{\mathop{\mathrm{E}}(Y)}^2\mathop{\mathrm{E}}[1] && \text{Pulling constants out of expectations (linearity)} \\ &= \mathop{\mathrm{E}}[ Y^2] - 2\qty{\mathop{\mathrm{E}}(Y)}^2 + \qty{\mathop{\mathrm{E}}(Y)}^2 && \text{Recognizing a product of something with itself as a square} \\ &= \mathop{\mathrm{E}}[ Y^2] - \cancel{2}\qty{\mathop{\mathrm{E}}(Y)}^2 + \cancel{\qty{\mathop{\mathrm{E}}(Y)}^2} && \text{Subtracting} \end{aligned} } \]
\[ \small{ \begin{aligned} \mathop{\mathrm{\mathop{\mathrm{V}}}}(Y \mid X) &= \mathop{\mathrm{E}}\qty[ \qty{ Y - \mathop{\mathrm{E}}(Y\mid X)}^2 \mid X ] && \text{ the \emph{spread} idea: the mean square of a conditionally centered version of $Y$ } \\ &= \mathop{\mathrm{E}}\qty(Y^2 \mid X) - \qty{\mathop{\mathrm{E}}\qty(Y \mid X)}^2 && \text{ the \emph{excess} idea: the average amount $Y^2$ exceeds the square of its cond. mean} \end{aligned} } \]
Claim. \[ \mathop{\mathrm{\mathop{\mathrm{V}}}}[Y] = \textcolor{blue}{\mathop{\mathrm{E}}\qty[\mathop{\mathrm{\mathop{\mathrm{V}}}}(Y \mid X)]} + \textcolor{green}{\mathop{\mathrm{\mathop{\mathrm{V}}}}\qty[\mathop{\mathrm{E}}(Y \mid X)]} \]
\[ \mathop{\mathrm{\mathop{\mathrm{V}}}}[Y] = \textcolor{blue}{\mathop{\mathrm{E}}\qty[ \mathop{\mathrm{E}}(Y^2 \mid X) - \qty{\mathop{\mathrm{E}}\qty(Y \mid X)}^2]} + \textcolor{green}{\mathop{\mathrm{E}}\qty{ \mathop{\mathrm{E}}(Y \mid X)^2} - \qty(\mathop{\mathrm{E}}\qty{\mathop{\mathrm{E}}\qty(Y \mid X)})^2} \]
\[ \mathop{\mathrm{\mathop{\mathrm{V}}}}[Y] = \textcolor{blue}{\mathop{\mathrm{E}}\qty[ \mathop{\mathrm{E}}(Y^2 \mid X)] - \mathop{\mathrm{E}}\qty[\qty{\mathop{\mathrm{E}}\qty(Y \mid X)}^2]} + \textcolor{green}{\mathop{\mathrm{E}}\qty[ \qty{\mathop{\mathrm{E}}(Y \mid X)^2} ] - \qty(\mathop{\mathrm{E}}\qty{\mathop{\mathrm{E}}\qty(Y \mid X)})^2} \]
\[ \mathop{\mathrm{\mathop{\mathrm{V}}}}[Y] = \textcolor{blue}{\mathop{\mathrm{E}}\qty[ \mathop{\mathrm{E}}(Y^2 \mid X)] \cancel{- \mathop{\mathrm{E}}\qty[\qty{\mathop{\mathrm{E}}\qty(Y \mid X)}^2]}} + \textcolor{green}{\cancel{\mathop{\mathrm{E}}\qty[ \qty{\mathop{\mathrm{E}}(Y \mid X)^2} ]} - \qty(\mathop{\mathrm{E}}\qty{\mathop{\mathrm{E}}\qty(Y \mid X)})^2} \]
\[ \mathop{\mathrm{\mathop{\mathrm{V}}}}[Y] = \textcolor{blue}{\mathop{\mathrm{E}}[Y^2] \cancel{- \mathop{\mathrm{E}}\qty[\qty{\mathop{\mathrm{E}}\qty(Y \mid X)}^2]}} + \textcolor{green}{\cancel{\mathop{\mathrm{E}}\qty[ \qty{\mathop{\mathrm{E}}(Y \mid X)^2} ]} - (\mathop{\mathrm{E}}\qty{Y})^2} \]
This is really still called the law of iterated expectations.
We showed that on Slide 3.1.