\documentclass[12pt]{article}
\usepackage{amssymb}
%\usepackage[french]{babel}
%\usepackage[T1]{fontenc}
\usepackage{graphicx}
\usepackage[left=25mm, right=25mm,top=25mm,bottom=25mm]{geometry}
\usepackage{amsmath,amsfonts,amsthm,array,cancel,dsfont,enumerate,multicol}%,enumitem}
\renewcommand{\thefootnote}{(\roman{footnote})}
\usepackage{hyperref,thmbox}%,pdfsync}

%%%% Index
\usepackage{makeidx}
\makeindex

\newcommand{\bbZ}{\mathbb{Z}}
\newcommand{\bbR}{\mathbb{R}}
\newcommand{\bbN}{\mathbb{N}}
\newcommand{\EE}{\mathbb{E}}
\newcommand{\PP}{\mathbb{P}}
\newcommand{\mm}{\vec{m}}
\newcommand{\nn}{\vec{n}}
\newcommand{\pp}{\vec{p}}
\newcommand{\eps}{\varepsilon}
\newcommand{\Zero}{\vec{0}}

% Theorems
\newtheorem{theo}{Theorem}[section]
\newtheorem{prop}[theo]{Proposition}
\newtheorem{lem}{Lemma}

%\renewcommand{\thelem}{\empty{}}
\newtheorem{coro}[theo]{Corollary}
\newtheorem{defi}[theo]{Definition}
\newtheorem{defitheo}[theo]{Definition/Theorem}
\newtheorem{example}[theo]{Example}
\newenvironment{rem}{
\begin{example}[Remark]
}{
\end{example}
}
\newenvironment{exo}{
\begin{example}[Exercise]
}{
\end{example}
}
%\newenvironment{defi}{
%\begin{example}[Definition]
%}{
%\end{example}
%}
%\newenvironment{defitheo}{
%\begin{example}[Definition/Theorem]
%}{
%\end{example}
%}
%\newtheorem*{defi}{Definition}
%\newtheorem*{defitheo}{Definition/Theorem}
%\newtheorem*{rem}{Remark}
%\newtheorem*{exo}{Exercise}
%\newtheorem*{example}{Example}
%\renewcommand{\therem}{\empty{}}

\renewcommand{\l}{\ell}
\newcommand{\Trib}{\mathcal{A}}
\newcommand{\Prob}{\mathbb{P}}
\newcommand{\card}{\mathrm{card}}
\newcommand{\set}[1]{\left\{#1\right\}}
\newcommand{\norme}[1]{\left\lVert #1\right\rVert}

%\title{Probability Theory}

\begin{document}

%\maketitle

\thispagestyle{empty}
\noindent \textsc{{\large Master QEF 2014-15}}  \hfill \textsc{{\large Hec Paris - \'Ecole Polytechnique}} 

\vfill

\begin{center}
\includegraphics[width=11cm]{Figures/HeadsTails.jpg}

\vfill

\noindent MAP551  - \textsc{{\Large Probability Theory}}% for Financial Economics}}\\
\end{center}

\vfill

\noindent{\large Lucas Gerin} \emph{(D\'epartement de Math\'ematiques Appliqu\'ees, \'Ecole Polytechnique)}\\
\verb|lucas.gerin@polytechnique.edu|

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage
\vspace{2mm}
\tableofcontents
\vfill
(\emph{Front cover picture: }\texttt{www.random.org/coins/}.)
%\vspace{15mm}
%\hrule
%\vspace{15mm}


%First of all, let me introduce myself. I am Lucas Gerin, Professeur Charg\'e de Cours at \'Ecole Polytechnique. I have an e-mail here and my office is just above n.3020 (wing 0, 3rd floor). I will tell you about \emph{Probability Theory}, here is the outline of the course.

%The goal of this course is to give you a rigorous framework in Probability for Finance.
%The first part is about measures and integration theory and is quite abstract. Hopefully, tutorials (by Mathieu Richard) should help you to understand such things.
%The second part is about probability itself, and is definitely more concrete. Well, let's begin with measures! Before going into details, let me give you a short reminder on sets that might be useful for this first part.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage
\section{Sets and Measures}
The goal of this chapter is to define \emph{probability spaces}, which are basically the sets of outcomes of a random experiment. In particular, we wonder for which sets we can compute the probability; such sets will be called \emph{measurable sets}. This approach of probability is due to Kolmogorov (Russia, XXth century).
\subsection{$\sigma$-algebras and measures}
Let $S$ be a set (later it will be the set of outcomes of a random experiment) and let $\Trib$ be a collection of subsets (or \emph{events}) of $S$, $\Trib$ is to be understood as the collection of events for which one can compute the probability.
Here is a more formal definition.
\begin{defi}
$\Trib$ is a \emph{$\sigma$-algebra} if the following conditions hold:
\begin{enumerate}[i)]
\item The empty set $\varnothing$ and the entire set $S$ are in $\Trib$.
\item If $A$ is in $\Trib$, then so is $A^c$ (the \emph{complement} of $A$).
\item If $A_1,A_2$ are in $\Trib$, then so are
$A_1\cup A_2$ and $A_1\cap A_2$.
\item More generally, if $A_1,A_2,A_3,\dots \in\Trib$, then
\begin{align*}
\bigcup_{n\geq 1}A_n&=A_1\cup A_2\cup A_3\cup\dots \in\Trib \\
\bigcap_{n\geq 1}A_n&=A_1\cap A_2\cap A_3\cap\dots \in\Trib .
\end{align*}
\end{enumerate}
\end{defi}
\begin{example}
For instance if we take for $S$ the set of outcomes of a die
$$
S=\set{1,2,3,4,5,6},
$$
here are two examples of $\sigma$-algebras:
\begin{enumerate}
\item The collection of {\bf all subsets} of $S$
$$
\varnothing, S,\set{1},\set{2},\dots,\set{6},\set{1,2},\dots,\set{1,2,3},\dots
$$
is a $\sigma$-algebra. It is denoted by $\mathcal{P}(S)$, and called the \emph{power set}.
\item The collection
$$
\mathcal{A}= \varnothing, S,\set{1,2}
$$
is not a $\sigma$-algebra since $\set{1,2}$ is in the collection but not its complement $\set{3,4,5,6}$. However
$$
\mathcal{A}=\varnothing, S,\set{1,2},\set{3,4,5,6}
$$
is a $\sigma$-algebra.
\item In order to understand item iv), let us imagine what would be the probability space defined by throwing a coin infinitely many times. Let $X_n\in\set{\text{ Heads/Tails }}$ be the $n$-th result. Then each event $\set{X_n=\text{Heads}}$ is in the $\sigma$-algebra and we would like to compute the probability of, for instance,
$$
\bigcap_{n\geq 1}\set{X_n=\text{ Heads }} = \text{"The coin always turns Heads"}.
$$
\end{enumerate}
\end{example}
For any $S$, we could take $\mathcal{P}(S)$, which is a $\sigma$-algebra, but for some technical reasons it contains too many events and in some cases it is not convenient to consider $\mathcal{P}(S)$.

Thus, we need the definition of \emph{generated} $\sigma$-algebra.
\begin{defi}
Let $S$ be a set and $C$ a collection of subsets of $S$. The $\sigma$-algebra generated by $C$ is the smallest $\sigma$-algebra containing $C$. It is denoted by $\sigma(C)$.
\end{defi}
Let's see what it gives on the example of the dice. Let's take for $C$ the single set $\set{1,2}$. What is there in $\sigma(\set{1,2})$?
\begin{itemize}
\item By definition there are $\varnothing$ and $S$.
\item There are $\set{1,2}$ and its complement $\set{3,4,5,6}$ and...
\item that's it since this is a $\sigma$-algebra.
\end{itemize}
Thus the $\sigma$-algebra generated by $\set{1,2}$ is
$$
\sigma(\set{1,2})=\varnothing, S,\set{1,2},\set{3,4,5,6}. 
$$
%You'll see more examples in tutorials.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection*{$\rhd\ $Measures}
A pair $(S,\Trib)$ is called a \emph{measurable space}. Basically, a \emph{measure} gives a \emph{mass} to every set.
\begin{defi}
Let $(S,\Trib)$ be a measurable space. A \emph{measure} on $S$ is an application
$$
\mu\colon \Trib\to [0,+\infty]
$$
(note that $+\infty$ is allowed) such that
\begin{itemize}
\item $\mu(\varnothing)=0$
\item {\bf (Countable additivity)} For every disjoint sets $A_1,A_2,\dots$ in $\Trib$
$$
\mu\left( \bigcup_{n\geq 1}A_n\right)=\sum_{n\geq 1} \mu(A_n).
$$
(This property should remind you of your undergraduate probability courses.) 
\end{itemize}
If furthermore $\mu(S)=1$ then $\mu$ is a \emph{probability measure}, and is usually denoted by $\Prob$.
\end{defi}
A triple $(S,\Trib,\mu)$ is called a \emph{measured} space. Three important examples:
\begin{itemize}
\item The \emph{counting\index{counting measure} measure}, defined on any $(S,\mathcal{A})$, is defined by
$$
\mu(A)=\card(A)=\text{ number of elements in }A.
$$
For instance, on $(\mathbb{N},\mathcal{P}(\mathbb{N}))$, it gives
$$
\mu(\varnothing)=0,\quad \mu(\set{6,9,10})=3,\quad \mu(\mathbb{N})=+\infty.
$$
Of course it is not a probability measure but yet it is useful for us.
\item The \emph{uniform measure} on a finite set $S$ is defined by
$$
\mu(A)=\frac{\card(A)}{\card(S)}.
$$
%it works on any finite set and it is a probability measure.
\item A useful notation is that of the \emph{Dirac\index{Dirac measure/mass} measure} (or \emph{Dirac mass})  at some point $a$, denoted by $\delta_a$. It puts a \emph{mass} one on $a$:
$$
\delta_a(A)=
\begin{cases}
1&\text{ if }a\in A,\\
0&\text{ otherwise.}
\end{cases}
$$
\end{itemize}
\subsubsection*{$\rhd\ $Properties of measures}
Plainly from the definition, we get that
\begin{prop}\label{Prop:Mesures}
Let $(S,\Trib,\mu)$ be a measured space.
\begin{enumerate}
\item If $A\subset B$, then $\mu(A)\leq \mu(B)$.
\item If $\mu(S)<+\infty$,
$$
\mu(A\cup B)=\mu(A)+\mu(B)-\mu(A\cap B),
$$
so that $\mu(A\cup B)\leq \mu(A)+\mu(B)$.
\end{enumerate}
More generally,
\begin{enumerate}
\item[3.] Let $(A_n)_{n\geq 1}$ be any sequence of sets (not necessarily disjoint),
$$
\mu\left( \bigcup_{n\geq 1}A_n\right)\leq \sum_{n\geq 1} \mu(A_n).
$$
\end{enumerate}
\end{prop}
%Let me explain item 2. It is quite clear from a picture: in $\mu(A)+\mu(B)$ you count $\mu(A\cap B)$ twice.
%\begin{center}
%\includegraphics[width=50mm]{Figures/Patates.pdf}
%\end{center}
\begin{exo}
Prove item 3. (Hint: Put $B_n=A_n\setminus\left(A_1\cup A_2\cup\dots \cup A_{n-1}\right)$.)
\end{exo}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Borel sets and the Lebesgue measure} 
Many interesting measures in this course are defined on the real line $\mathbb{R}$, so we need to equip it with a $\sigma$-algebra.
\begin{defi}
The \emph{Borel\index{Borel (set/algebra)} $\sigma$-algebra} on $\mathbb{R}$, denoted by $\mathcal{B}(\mathbb{R})$, is the $\sigma$-algebra generated by all open intervals $(a,b)$ for every $a,b\in\mathbb{R}$. A set in  $\mathcal{B}(\mathbb{R})$ is said to be a \emph{Borel set}.
\end{defi}
What's there in $\mathcal{B}(\mathbb{R})$? Open intervals of course but also
\begin{itemize}
\item closed intervals $[a,b]$, since
$$
[a,b]=\left(\underbrace{(-\infty,a)}_{\text{Borel set}}\cup \underbrace{(b,+\infty)}_{\text{Borel set}}\right)^\text{c}.
$$
\item more complicated sets as $\mathbb{N}$ since it is a countable union of Borel sets:
$$
\mathbb{N}=\bigcup_{k\geq 1}  \underbrace{\set{k}}_{\text{Borel set}}
$$ 
\end{itemize}
In fact, every set you might think of is a Borel set! It is quite difficult (and not very useful for you) to build a set which is not Borel-measurable.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection*{$\rhd\ $The Lebesgue measure}
%Now we introduce the Lebesgue measure, which is the most useful measure on $\mathcal{B}(\mathbb{R})$.
\begin{defitheo}
The Lebesgue\index{Lebesgue measure} measure, usually denoted by $\lambda$, is the {\bf only} measure on $(\mathbb{R},\mathcal{B}(\mathbb{R}))$ such that for any real numbers $a,b$,
$$
\lambda\left( (a,b) \right) = b-a.
$$
\end{defitheo}
For a Borel set $A$, $\lambda(A)$ is interpreted as the \emph{length} of $A$. Let us compute some $\lambda(A)$:
\begin{itemize}
\item For a real $a$ we have, for any integer $n\geq 1$, 
$$
\lambda(\set{a})\leq \lambda\left( (a-1/n,a+1/n) \right)=2/n
$$
and thus $\lambda(\set{a})=0$. This is consistent since $\set{a}$ has no length.
\item For any arbitrary $x$, we have $\lambda(\mathbb{R})\geq \lambda\left( (-x,x) \right)=2x$, thus $\lambda(\mathbb{R})=+\infty$.
\end{itemize}
%Lebesgue's measure is also defined in higher dimension: this is the only measure on $\mathbb{R}^n$ such that
%$$
%\lambda\left( (a_1,b_1)\times(a_2,b_2)\times \dots \times(a_n,b_n) \right)=\prod_{i} (b_i-a_i).
%$$
%This corresponds to volume.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Measurable functions and random variables} 

Let $(S,\Trib)$ be a measurable space and $f$ be a function
$$
\begin{array}{r c c c}
f: & S & \to & \mathbb{R}\\
   & x & \mapsto & f(x).
\end{array}
$$
We say that $f$ is measurable (w.r.t. $\Trib$) if for every Borel set $A$, $f^{-1}(A)$ is in $\Trib$. Recall that the notation $f^{-1}$ stands for
$$
f^{-1}(A)=\set{x\text{ such that }f(x)\in A}.
$$
Let's see some examples:
\begin{itemize}
\item For a finite $S$ with its power set $\mathcal{P}(S)$, every function $f:S\to\mathbb{R}$ is measurable.
\item A function $f:\bbR\to\bbR$ which is measurable w.r.t $\mathcal{B}(\mathbb{R})$ is called a \emph{Borel function}\index{Borel function}.
We admit that every continuous (or piecewise continuous) function is measurable.
In fact, every function you might think of is a Borel function.
\item A very useful example of measurable function is that of \emph{indicator\index{indicator function} functions}. For $A\in\Trib$, the indicator function of $A$ is denoted by $\mathds{1}_A$ and defined by
$$
\begin{array}{r c c l}
\mathds{1}_A: & S & \to & \mathbb{R}\\
   & x & \mapsto & 
\begin{cases}
1&\text{ if }x\in A,\\
0&\text{ otherwise.}
\end{cases}
\end{array}
$$
\end{itemize}

\subsubsection*{$\rhd\ $Probability spaces}
If $\mu$ (or $\mathbb{P}$) is a probability measure, then usually
\begin{itemize}
\item the set $S$ is the set of outcomes and is often denoted by $\Omega$,
\item elements of $\Omega$ are denoted by $\omega$,
\item measurable functions are called \emph{random variables} and denoted with upper-case letters $X,Y,\dots$.
\end{itemize}
 %Now is the time for a very important notion.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection*{$\rhd\ $$\sigma$-algebra generated by a random variable}
\begin{defi}
Let $X:\Omega\to\bbR$ be a random variable defined on a probability space $(\Omega,\Trib,\mu)$. The $\sigma$-algebra generated by $X$, denoted by $\sigma(X)$ is the $\sigma$-algebra generated by sets $X^{-1}(A)$ for all Borel sets.
\end{defi}
As we will see, $\sigma(X)$ is sometimes interpreted as the information carried by $X$. In order to understand what it means, I work out a little example: take $\Omega=\set{1,\dots,6}$ and put
$$
\begin{array}{r c c l}
X: & \Omega & \to & \mathbb{R}\\
   & 1,2,3 & \mapsto & 0,\\
   & 4,5,6 & \mapsto & 1.
\end{array}
$$
In other words, $X(\omega)=\mathds{1}_{\set{4,5,6}}(\omega)$. Which sets are there in $\sigma(X)$ ? There are $\varnothing$ and $\Omega$ as always, but also
$$
X^{-1}(1)=\set{4,5,6}\quad \text{ and }\quad X^{-1}(0)=\set{1,2,3}
$$
and... that's it. Now, what can we say about a random variable $Y$ which is $\sigma(X)$-measurable? I claim that
$$
Y(1)=Y(2)=Y(3)\qquad \text{ and that} \qquad Y(4)=Y(5)=Y(6).
$$
Indeed, set $Y(1)=a$. Then $Y^{-1}(\set{a})$ belongs to $\sigma(X)$ and contains $1$, so it must contain $\set{1,2,3}$, hence $Y(1)=Y(2)=Y(3)$. Similarly, $Y(4)=Y(5)=Y(6)=b$ for some $b$. Finally, we can write
$$
\begin{array}{r c c l}
Y: & \Omega & \to & \mathbb{R}\\
   & 1,2,3 & \mapsto & a,\\
   & 4,5,6 & \mapsto & b,
\end{array}
$$
for some $a,b$ which might be equal. We have proved that
$$
Y=a+(b-a)X,
$$
$Y$ is a function of $X$: this is a particular case of the following very important statement.
\begin{theo}\label{Th:sigmaY}
A random variable $Y:\Omega\to\bbR$ is measurable with respect to $\sigma(X)$ if and only if there exists a Borel function $f$ such that $Y=f(X)$.
\end{theo}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Limits of sets}
The question we investigate here is, "when do we have $\mu(\lim_n A_n) = \lim_n \mu(A_n)$ ?"
And, by the way, does $\lim_n A_n$ make sense? A first case is when $(A_n)_{n\geq 1}$ is a sequence of \emph{monotone} sets.
\begin{theo}\label{Theo:LimitsSets}
Let $(S,\Trib,\mu)$ be a measured space.
\begin{enumerate}
\item Let $(A_n)_{n\geq 1}$ be an increasing sequence of measurable sets, \emph{i.e.}
$
A_1\subset A_2\subset A_3\subset \dots
$
Then
$$
\mu\left(\bigcup_{n\geq 1} A_n\right)=\lim_{n\to\infty} \nearrow \mu(A_n).
$$
\item Let $(B_n)_{n\geq 1}$ be a decreasing sequence of measurable sets: $B_1\supset B_2\supset B_3\supset \dots$. {\bf Assume furthermore that $\mu$ is a finite measure}, then
$$
\mu\left(\bigcap_{n\geq 1} B_n\right)=\lim_{n\to\infty} \searrow \mu(B_n).
$$
\end{enumerate}
\end{theo}
We really need $\mu$ to be finite for item 2, as the following example with the Lebesgue measure shows:
$$
\lambda\left(\bigcap_{n\geq 1} [n,+\infty)\right)=\lambda(\varnothing)=0\neq +\infty= \lim_{n\to\infty} \searrow \lambda\left([n,+\infty)\right)
$$
\begin{exo}
Prove item 1. of Theorem \ref{Theo:LimitsSets}. (Hint: Set $E_n=A_n\setminus A_{n-1}$.)
\end{exo}
\begin{example}{\bf (A fair coin eventually turns Tails)}.\\
We turn back to the example of a fair coin flipped infinitely many times, we now can prove rigorously that eventually the coin turns Tails.

Let $X_n\in\set{\text{Heads,Tails}}$ be the $n$-th result. We have
$$
\set{\text{ the coin never turns Tails }}=\bigcap_{n\geq 1} \set{X_1=X_2=\dots =X_n=\text{ "Heads" }},
$$
and the sequence $(B_n)_{n\geq 1}$ defined by $B_n=\set{X_1=X_2=\dots =X_n=\text{ "Heads" }}$ is decreasing since obviously
$$
\set{X_1=X_2=\dots =X_n=X_{n+1}=\text{ "Heads" }}\subset \set{X_1=X_2=\dots =X_n=\text{ "Heads" }}
$$
Surely you see why 
$$
\mathbb{P}\left( X_1=X_2=\dots =X_n=\text{ "Heads" } \right)=\frac{1}{2^n}.
$$
Thus item 2. in Theorem \ref{Theo:LimitsSets} says that
\begin{align*}
\Prob\left(\text{ the coin never turns Tails }\right)&=\lim_{n\to +\infty}\Prob \left(X_1=X_2=\dots =X_n=\text{ "Heads" }\right)\\
&= \lim_{n\to +\infty} \frac{1}{2^n}=0.
\end{align*}
Then
$$
\Prob\left(\text{ the coin eventually turns Tails }\right)=1-\Prob\left(\text{ the coin never turns Tails }\right)=1.
$$
\end{example}

\subsubsection*{$\rhd\ $ limsup of events}
Let $(A_n)$ be a sequence of events, we are often interested in "how many of the $A_n$'s occur?". There is a useful notation for that.
Consider the event
\begin{center}
\begin{tabular}{r c l}
"$A_n$ occurs infinitely often" & $=$ & "For any $p$, there is $n\geq p$ such that $A_n$ occurs"\\
                                & $=$ & "For any $p$, $A_p \cup A_{p+1}\cup A_{p+2}\cup\dots$"\\
                                & $=$ & $\bigcap_{p\geq 1} \bigcup_{n\geq p} A_n$
\end{tabular}
\end{center}
This event is denoted by\index{limsup (event)} $\limsup_{n\to +\infty} A_n$.

%Similarly, we consider the event
%\begin{center}
%\begin{tabular}{r c l}
%"$A_n$ occurs for all but finitely $n$" & $=$ & "There exists $p$ such that, for all $n\geq p$, $A_n$ occurs"\\
%                                & $=$ & "There exists $p$, such that $A_p \cap A_{p+1}\cap A_{p+2}\cap\dots$"\\
%                                & $=$ & $\bigcup_{p\geq 1} \bigcap_{n\geq p} A_n$
%\end{tabular}
%\end{center}
%This event is denoted by\index{liminf (event)} $\liminf_{n\to +\infty} A_n$.

Let's see a simple example, you will see another one in tutorials. Take
$$
A_n=
\begin{cases}
&[0,1]\text{ if $n$ is odd},\\
&[0,2]\text{ if $n$ is even}.
\end{cases}
$$
Then it is clear that $\bigcup_{n\geq p} A_n =[0,2]$
for all $p$, and then
$$
\limsup_n A_n= \bigcap_{p\geq 1} \bigcup_{n\geq p} A_n =\bigcap_{p\geq 1}  [0,2]=[0,2].
$$
%Besides, $\bigcap_{n\geq p} A_n =[0,1]$ for all $p$, and then
%$$
%\bigcup_{p\geq 1} \bigcap_{n\geq p} A_n =\bigcup_{p\geq 1}  [0,1]=[0,1].
%$$
%This shows that, in general, $\liminf A_n\neq \limsup A_n$, but we always have
%$$
%\liminf_{n\to +\infty} A_n \subset\limsup_{n\to +\infty} A_n.
%$$
%We will see more of that when we will study convergences of random variables.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage
\section{Random variables and expectation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
From now on, we work on a measured space $(\Omega,\Trib,\Prob)$ where $\Prob$ is a probability measure. 
We say that an event $A$ is $\Prob$-\emph{almost sure}\index{almost sure}, or just \emph{almost sure}, if $\Prob(A)=1$.

Elements of $\Omega$ are often denoted by $\omega$. Recall that a random variable $X$ is just a measurable function
$$
X:\Omega\to\mathbb{R}.
$$
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Random variables and their laws}
\begin{defi}
The \emph{law} of $X$, denoted by $\Prob_X$ is the measure on $\mathcal{B}(\mathbb{R})$ such that for any Borel set $A$
$$
\Prob_X(A)=\Prob\left(\set{\omega\text{ such that }X(\omega)\in A}\right)=\mathbb{P}(X\in A).
$$
We write $X\sim \Prob_X$ which reads "$X$ follows the law $\Prob_X$".
\end{defi}
\begin{example} {\bf Fair coin.} Take $\Omega=\set{H,T}$ and
$$
\begin{array}{r r c c}
X: & H & \mapsto & 2\\
   & T & \mapsto & 0
\end{array}
$$
Then
$$
\Prob_X(\set{2})=1/2 = \Prob_X(\set{0}),
$$
and we have $\Prob_X=\tfrac12 \delta_0 +\tfrac12\delta_2$ (while $\Prob=\tfrac12 \delta_H +\tfrac12\delta_T$).
\end{example}
\begin{center}\emph{
Probability laws are Borel measures and, as such, are complicated objects. This is why we prefer to deal with simpler objects: cumulative distribution functions.}
\end{center}
%Recall that if $\mu$ is a probability measure, then to determine $\mu$ it is enough to know $\mu((-\infty,t])$ for any $t$. Then, we set
\begin{defitheo}
The \emph{cumulative\index{cdf|see{cumulative distribution function}}\index{cumulative distribution function} distribution function} (or just distribution function) of $X$ is the function $F_X$ defined by
$$
\begin{array}{r c c c}
F_X: & \mathbb{R} & \to & [0,1]\\
     & t          & \mapsto & \Prob(X\leq t).
\end{array}
$$
If $F_X(t)=F_Y(t)$ for every $t$, then $X$ and $Y$ have the same law.
\end{defitheo}
Some properties of $F_X$:
\begin{itemize}
\item If $s\leq t$, then $\set{X\leq s}\subset \set{X\leq t}$, and so $F_X(s)\leq F_X(t)$. So $F_X$ is (weakly) increasing.
\item $\displaystyle{\lim_{t\to -\infty} F_X(t)=\Prob(\varnothing)=0,\quad \lim_{t\to +\infty} F_X(t)=\Prob(\Omega)=1}$.
\item $F_X$ is right-continuous:
\begin{align*}
\lim_{n\to+\infty} F_X(t+1/n)&= \lim_{n\to+\infty} \Prob(X\leq t+1/n)\\
                             &= \Prob(\cap_{n\geq 1} \set{X\leq t+1/n})\qquad \text{(by Theorem \ref{Theo:LimitsSets})}\\
                             &= \Prob(X\leq t)=F_X(t).
\end{align*}
\end{itemize}

\begin{example} {\bf Fair coin cont'd}
\begin{center}
\includegraphics[width=75mm]{Figures/pdf.pdf}
\end{center}
\end{example} 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection*{$\rhd\ $Examples of laws: discrete random variables}
We say that $X$ is \emph{discrete} if $X$ takes its values in a finite or countable set.
\begin{itemize}
\item Bernoulli distribution with parameter $p\in[0,1]$:
$$
\Prob(X=1)=p,\qquad \Prob(X=0)=1-p.
$$
\item Binomial\index{distributions!binomial} distribution with parameters $n\geq 1, p\in[0,1]$ = number of successes in $n$ Bernoulli trials:
$$
\Prob(X=k)=\binom{n}{k}p^k(1-p)^{n-k} \text{ for }k=0,1,\dots,n.
$$
\item Geometric\index{distributions!geometric} distribution with parameter $p\in[0,1]$ = first success in Bernoulli trials:
$$
\Prob(X=k)=(1-p)^{k-1}p  \text{ for }k=1,2,\dots
$$
\item Poisson\index{distributions!Poisson} distribution with parameter $\lambda >0$:
$$
\Prob(X=k)=e^{-\lambda} \frac{\lambda^k}{k!}\text{ for }k=0,1,2,\dots
$$
\end{itemize}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection*{$\rhd\ $Examples of laws: continuous random variables}
We say that $X$ is \emph{continuous} if there exists a non-negative and Borel function $f$ such that
$$
\Prob(X\in A)=\int_A f(x)\,dx.
$$
The function $f$ is the \emph{density}\index{density} of $X$. Of course $\int_\bbR f(x)dx=\Prob(X\in \bbR)=1$.
\begin{itemize}
\item Uniform distribution on $[a,b]$:
$$
f(x)=\frac{1}{b-a}\mathds{1}_{[a,b]}(x).
$$
\item Exponential\index{distributions!exponential} distribution $\mathcal{E}(\lambda)$ with parameter $\lambda >0$:
$$
f(x)=\lambda \exp(-\lambda x) \mathds{1}_{x \geq 0}.
$$
\item Normal\index{distributions!normal|see{gaussian}} distribution (or gaussian\index{distributions!gaussian} distribution) with parameters $\mu\in\bbR$, $\sigma^2 >0$:

\begin{center}
\begin{tabular}{m{70mm} m{65mm}}
$f(x)=\frac{1}{\sigma\sqrt{2\pi}} \exp\left(-\frac{(x-\mu)^2}{2\sigma^2} \right)$.
&
\includegraphics[width=40mm]{Figures/GaussienneBrut_mu.pdf}
\end{tabular}
\end{center}
\item Pareto distribution with parameters $a,k$:
$$
f(x)=k\frac{a^k}{x^{k+1}}\text{ for }x\geq a.
$$
(Pareto laws are used to describe distribution of income in a population).
\end{itemize}
\subsubsection*{$\rhd\ $cumulative distribution functions and densities}
If $X$ is continuous, then $F_X(t)=\int_{-\infty}^t f(x)dx$ and thus
$$
F_X'(t)=f(t).
$$
\begin{example}
Let $X$ follow the uniform distribution in $[0,1]$. What is the law of $-\log (X)$? Let us compute its cdf: for $t>0$,
\begin{align*}
F(t)=\Prob(-\log(X)\leq t)&=\Prob(X\geq \exp(-t))\\
                     &=\int_{\exp(-t)}^1 dx=1-\exp(-t).
\end{align*}
But then we have the density of $-\log(X)$:
$$
F'(t)=\exp(-t)\mathds{1}_{t>0},
$$
this proves that $-\log(X)$ follows the exponential distribution with parameter $1$.
\end{example}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Abstract expectation}%\label{Sec:Expectation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\subsubsection*{$\rhd\ $Definition}
Let $X$ be {\bf non-negative} random variable defined on a probability space $(\Omega,\mathcal{A},\Prob)$.
%\begin{defi}
%A measure $\mu$ is said to be \emph{$\sigma$-finite} if it is finite or if there exists a sequence of measurables sets %$S_1,S_2,\dots$ such that $S=\cup_{n\geq 1}S_n$ and
%$\mu(S_n)<+\infty$ for every $n$.
%\end{defi}
%Let $(S,\Trib,\mu)$ be a measured space. We want to define an integral w.r.t. $\mu$.
\begin{defitheo}
The \emph{expectation} $\EE[X]$ of $X$ is a real number in $[0,+\infty) \cup \set{+\infty}$ with properties
\begin{itemize}
\item If $X=\mathds{1}_A$ then 
$$
\mathbb{E}[X]=\mathbb{E}[\mathds{1}_A]= \mathbb{P}(A).
$$
\item {\bf (Linearity)} For any real numbers $a,b$ and non-negative random variables $X,Y$,
$$
\mathbb{E}[aX+bY]=a\mathbb{E}[X]+b\mathbb{E}[Y].
$$
\item {\bf (Monotonicity)} If $X(\omega)\leq Y(\omega)$ for every $\omega$, then $\mathbb{E}[X]\leq \mathbb{E}[Y]$.
\item {\bf (Monotone convergence)} If for each $\omega\in \Omega$, $X_n(\omega)\nearrow X(\omega)$ then $X$ is measurable and
$$
\mathbb{E}[X]=\mathbb{E}[\lim X_n]=\lim \mathbb{E}[X_n].
$$
\end{itemize}
\end{defitheo}
\begin{rem}
From a more theoretical point of view, the expectation $\EE[X]$ is constructed as the integral of function $\omega\mapsto X(\omega)$ with respect to the measure $\Prob$. This is the theory of \emph{Lebesgue integration} (or \emph{abstract integration}) and goes much beyond the scope of this course. Yet, it explains why you also can find the notation
$$
\EE[X]=\int_\Omega X(\omega)d\Prob(\omega),
$$
where $\int_\Omega$ is an abstract integral (recall that $\Omega$ is not an interval of $\bbR$!) and "$d\Prob$" reads "with respect to $\Prob$".
\end{rem}
We now see some important examples.
\subsubsection*{$\rhd\ $Expectation of a discrete random variable}
Let $\Omega$ be some finite or countable space $\set{\omega_1,\omega_2,\dots}$ and $X$ a non-negative random variable defined on $\Omega$. Then we can write
$$
X(\omega)=X(\omega_1)\times \mathds{1}_{\omega=\omega_1} +X(\omega_2)\times \mathds{1}_{\omega=\omega_2}+\dots 
$$
and thus
\begin{align*}
\mathbb{E}[X]&=\mathbb{E}[X(\omega_1)\times \mathds{1}_{\omega=\omega_1}] + \mathbb{E}[X(\omega_2)\times \mathds{1}_{\omega=\omega_2}]+\dots\\
      &= X(\omega_1)\times \mathbb{E}[\mathds{1}_{\omega=\omega_1}] + X(\omega_2)\times \mathbb{E}[\mathds{1}_{\omega=\omega_2}]+\dots\\
      &= X(\omega_1)\times \Prob(\set{\omega_1}) + X(\omega_2)\times \Prob(\set{\omega_2})+\dots\\
      &= \sum_{\omega\in\Omega} X(\omega)\Prob(\set{\omega}).
\end{align*}
and this coincides with the usual notion of expectation (or average).
\subsubsection*{$\rhd\ $Expectation of a continuous random variable}
Assume that $X$ has density $f$ on $\mathbb{R}_+$. We won't get into mathematical details but we always can approach such an $f$ by below with an increasing sequence of staircase functions $g$'s:
\begin{center}
\includegraphics[width=65mm]{Figures/Staircase.pdf}
\end{center}

By taking $g$'s closer and closer to $f$, then we approach the area below the curve of $f$. If $f$ is continuous and integrable in the Riemann sense (the usual integral that you already know), then
$$
\mathbb{E}[X]=\int_\mathbb{R_+} xf(x)dx.
$$

\begin{center}\emph{
Of course there are many theorems hidden behind this loose picture.\\% but these are main ideas.\\
I hope that you begin to understand the strength of this abstract point of view of probability: it covers both theory of discrete and continuous variables.}
\end{center}

\subsubsection*{$\rhd\ $Expectations of random variables of any sign}

Now, how to define $\mathbb{E}[X]$ when $X$ has arbitrary sign? Set
$$
X^+=\begin{cases}
X&\text{ if }X\geq 0,\\ 0&\text{ otherwise}.\end{cases}
\qquad
X^-=\begin{cases}0&\text{ if }X\geq 0,\\ -X&\text{ otherwise}.\end{cases}
$$
Then we can write $X=X^+-X^-$ (note also that $|X|=X^+ +X^-$)
%\begin{center}
%\includegraphics[width=105mm]{Figures/fplusfmoins.pdf}
%\end{center}
and, if $\mathbb{E}[X^+]<+\infty$ and $\mathbb{E}[X^-]<+\infty$ (those expectations are well-defined since $X^+,X^-$ are non-negative random variables) then we set
$$
\mathbb{E}[X]= \mathbb{E}[X^+]-\mathbb{E}[X^-].
$$
Note that since $X\leq |X|$ and $-X\leq |X|$, then $\mathbb{E}[X]\leq \mathbb{E}[|X|]$ and $-\mathbb{E}[X]\leq \mathbb{E}[|X|]$, and thus:
$$
\left|\mathbb{E}[X]\right|\leq \mathbb{E}[|X|].
$$
One says that $X:\Omega\to\bbR$ is \emph{integrable}\index{integrable r.v.} if $\mathbb{E}[|X|]<+\infty$.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Swapping $\EE$ and limit}

Our main concern here is:
\begin{center}\emph{
When can we swap expectation and limit: when is $\EE [\lim_n X_n]$ equal to $\lim_n \EE [X_n]$ ?.}
\end{center}
In the definition of the expectation we already saw the
\begin{theo}[Monotone\index{monotone convergence Theorem} convergence]
Let $(X_n)$ be a sequence of non-negative measurable random variables. Assume that $(X_n)$ is $\nearrow$: for all $\omega$, $(X_n(\omega))_{n\geq 1}$ is increasing. Then $X=\lim_n X_n$ is measurable and
$$
\mathbb{E}[X]=\mathbb{E}[\lim_{n\to+\infty} X_n]=\lim_{n\to+\infty} \mathbb{E}[X_n].
$$
\end{theo}
\begin{example}
Imagine that $T$ is the random variable giving the first time some event occurs (the first "Heads" in a flip coin for instance). Then $\min\set{T,n}=T$ if the event occurs before $n$, $\min\set{T,n}=n$ otherwise. It is clear that $\left(\min\set{T,n}\right)_{n\geq 1}$ is an increasing sequence and that it goes to $T$ so we have
$$
\EE[\min\set{T,n}] \stackrel{n\to +\infty}{\to} \EE[T],
$$
which is often useful when studying random processes.
\end{example}

\newpage
A VERY important application of monotone convergence:
\begin{prop}[Swapping $\sum$ and $\EE$]
Let $(X_k)$ be a sequence of non-negative random variables. Then $\sum_{k=0}^n X_k \nearrow \sum_{k=0}^\infty X_k$ and then
$$
\EE \left[\sum_{k=0}^\infty X_k \right]= \sum_{k=0}^\infty \EE[ X_k].
$$
\end{prop}

If we want to deal with arbitrary sequences, we need another assumption: \emph{domination}. 

Before going into details, let us begin by a definition. 
We say that $(X_n)_{n\geq 1}$ converges to $X$ \emph{almost surely} if
$$
\Prob\left(X_n\stackrel{n\to\infty}{\longrightarrow} X\right)=\Prob\left(\omega\text{ such that }X_n(\omega)\stackrel{n\to\infty}{\longrightarrow} X(\omega) \right) =1.
$$

\begin{theo}[Dominated-convergence\index{dominated-convergence Theorem} Theorem]
Assume that $(X_n)$ converges to $X$ almost surely. Assume also that all $X_n$'s are \emph{dominated} by $Y$: for all $n\geq 1$ and $\omega\in \Omega$,
$$
|X_n(\omega)|\leq |Y(\omega)|
$$
where $Y$ is integrable: $\EE[|Y|]<+\infty$.
Then
$$
\mathbb{E}[\lim_{n\to+\infty} X_n]=\lim_{n\to+\infty} \mathbb{E}[X_n].
$$
\end{theo}
\begin{rem} In fact, with these assumptions, we even have a stronger result:
$$
\mathbb{E}[|X_n-X|] \to 0.
$$
\end{rem}
We now give a useful result which turns out to be an application of the dominated-convergence Theorem.
\begin{theo}[Differentiating inside expectations]\label{Theo:DeriveeEsperance}
Let $I$ be an interval, and
$$
\begin{array}{r r c l}
f: & I\times \bbR  & \to    & \mathbb{R}\\
   & (t,X)      &\mapsto & f(t,X)
\end{array}
$$
Assume that for every $t\in I$, the random variable $f(t,X)$ is integrable and that
$
\left|\frac{\partial}{\partial t}f(t,X)\right| \leq g(X)
$
with $\EE[g(X)]<+\infty$, then
$$
\frac{\partial}{\partial t} \EE[f(t,X)] = \EE\left[ \frac{\partial}{\partial t} f(t,X)\right].
$$
\end{theo}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\subsection{Changes of measure and Radon-Nikodym derivative}
%Integration theory gives us a new way to construct measures, let's see how. Let $\mu$ be a mesure and $f$ be a non-negative measurable function, $f$ defines a new measure $f\mu$ in the following way:
%$$
%f\mu(A)=\int_A fd\mu.
%$$
%It is clear that $f\mu(\varnothing)=0$ and one also can check the other properties of a measure. If $g$ is another measurable function, we have
%$$
%\int gd(f\mu) = \int g fd\mu
%$$
%and therefore one writes
%$$
%f=\frac{d(f\mu)}{d\mu},
%$$
%one says that $f$ is the \emph{Radon-Nikodym derivative} or \emph{density} of $f\mu$ with respect to $\mu$.
%\begin{example}
%If we take $\mu=\lambda$= the Lebesgue measure over $\mathbb{R}_+$ and $f(x)=\exp(-x)$ then
%$$
%f\lambda([a,b])=\int_a^b \exp(-x)dx=\exp(-a)-\exp(-b).
%$$
%It is a probability measure since $f\lambda([0,+\infty)=\exp(0)-\exp(-\infty)=1$.
%\end{example}
%It is interesting to note that
%$$
%\mu(A)=0 \Rightarrow f\mu(A)=0.
%$$
%One says that $f\mu$ is {\bf absolutely continuous} with respect to $\mu$.
%And in fact, the converse also holds. 
%\begin{theo}[The Radon-Nikodym Theorem]
%Let $\nu,\mu$ be two $\sigma$-finite measures on a measurable space $(S,\Trib)$, and assume that $\nu$ is absolutely continuous w.r.t $\mu$. Then there exists a non-negative and measurable function $f$ such that
%$$
%\nu=f\mu.
%$$
%\end{theo}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Properties of expectations}

\begin{prop}[Expectation of a function of $X$]\ 
Let $\phi$ be a Borel function $\mathbb{R}\to\bbR$,
\begin{itemize}
\item If $X$ has density $f$, then $\mathbb{E}[\phi(X)]=\int_{\mathbb{R}} \phi(x) f(x) dx$.
\item If $X$ is discrete, then $\mathbb{E}[\phi(X)]=\sum_{k} \phi(k)\mathbb{P}(X=k)$,
where the sum runs over all the possible values for $X$: $\bbN,\bbZ,\dots$
\end{itemize}
\end{prop}

\begin{example}
Let $X$ follow the uniform distribution in $[0,1]$,
$$
\mathbb{E}[X^2]=\int x^2\times \mathds{1}_{[0,1]}(x) dx=\int_{[0,1]} x^2 dx=1/3.
$$
\end{example}
\begin{exo}
Let $X\sim\mathcal{E}(1)$. Prove by induction that $\mathbb{E}[X^n]=n!$.
\end{exo}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection*{$\rhd\ $Inequalities on expectations}
You already know that $\EE[|X|]\geq |\EE[X]|$. This is in fact a particular case of the following theorem:
\begin{theo}[Jensen's\index{Jensen's inequality} inequality]
Let $\phi:\bbR\to\bbR$ be a convex function and $X$ be an integrable r.v. Then
$$
\EE\left[\phi(X)\right] \geq \phi\left( \EE[X] \right).
$$ 
\end{theo}
For instance we have $\EE[X^2]\geq \EE[X]^2, \EE[e^X]\geq e^{\EE[X]},\dots$

\begin{proof}
\begin{multicols}{2}

\ 

\vspace{1cm}

\noindent For simplicity we assume that $\phi$ is differentiable. Consider its curve, by convexity the tangent line at point $\mathbb{E}[X]$ is below the curve: for all real $x$ (see the picture on the right)
$$
\phi(x)\geq \phi(\EE[X])+\phi'(\EE[X])(x-\EE[X]).
$$

\begin{center}
\includegraphics[width=85mm]{Figures/Jensen.pdf}
\end{center}
\end{multicols}
In particular, this inequality holds when $x$ is replaced by the random variable $X$. By taking expectations,
\begin{align*}
\EE[\phi(X)]&\geq \EE\bigg[\phi(\EE[X])+\phi'(\EE[X])(X-\EE[X])\bigg]\\
&\geq \EE[\phi(\EE[X])]+\phi'(\EE[X])\EE\left[X-\EE[X]\right]= \phi(\EE[X])+0,
\end{align*}
since $\phi(\EE[X])$ is a constant, and $\EE\left[X-\EE[X]\right]=\EE[X]-\EE[X]=0$.

\end{proof}

An important notion (which most of you already know) is that of variance. First note that if $X$ is such that $\EE[X^2]<+\infty$ then, since $|X|\leq 1+X^2$, we have $\EE[|X|]<+\infty$, and then $\EE[X]$ is well-defined.
\begin{defi}
Let $X$ be a random variable such that $\EE[X^2]<+\infty$. The \emph{variance} of $X$ is defined by
$$
\mathrm{Var}(X)=\EE\left[(X-\EE[X])^2\right].
$$
It tells you how much $X$ deviates from its mean.
\end{defi}

\medskip

\noindent Let us expand what's inside the expectation:
\begin{align*}
\EE\left[(X-\EE[X])^2\right]&=\EE\left[ X^2+\EE[X]^2-2X\EE[X]\right]\\
                            &=\EE[X^2]+\EE\left[\EE[X]^2\right]-2\EE\left[X\EE[X]\right] \text{ (by linearity)}\\
                            &=\EE[X^2]+\EE[X]^2-2\EE[X]^2\\
			    &=\EE[X^2]-\EE[X]^2.
\end{align*}
Note also that, plainly from the definition, we have for all constants $a,b$
\begin{equation}\label{Eq:PropVariance}\tag{$\$$}
\mathrm{Var}(aX+b)=\mathrm{Var}(aX)=a^2\mathrm{Var}(X).
\end{equation}
We now can state two important inequalities that estimate the probability that $X$ deviates from its mean:
\begin{theo} Let $X$ be an integrable random variable.
\begin{itemize}
\item {\bf Markov's inequality.} If $X$ is non-negative and $c>0$ is a constant,
\begin{center}
\begin{tabular}{m{55mm} m{63mm}}
$$
\Prob(X\geq c)\leq \frac{\EE[X]}{c}.
$$
&
\includegraphics[width=62mm]{Figures/Markov.pdf}
\end{tabular}
\end{center}
\item {\bf Chebyshev's\index{Chebyshev's inequality} inequality.} If $\mathrm{Var}(X)<+\infty$ and $a>0$ is a constant,
\begin{center}
\begin{tabular}{m{60mm} m{80mm}}
$$
\Prob\left(|X-\EE[X]|\geq a\right)\leq \frac{\mathrm{Var}(X)}{a^2}.
$$
&
\includegraphics[width=79mm]{Figures/Chebyshev.pdf}
\end{tabular}
\end{center}
\end{itemize}
\end{theo}
\begin{proof}\noindent{\bf Markov's inequality.} 
\begin{align*}
1&\geq \mathds{1}_{X\geq c}\\
X&\geq X\mathds{1}_{X\geq c}\text{ (since $X\geq 0$)}\\
\EE[X]&\geq \EE\left[X\mathds{1}_{X\geq c}\right]\geq c\EE\left[\mathds{1}_{X\geq c}\right],
\end{align*}
and now remember that $\EE[\mathds{1}_{X\geq c}]$ is just $\Prob(X\geq c)$.

\noindent{\bf Chebyshev's inequality.} First note that $\set{|X-\EE[X]|\geq a}$ and $\set{(X-\EE[X])^2\geq a^2}$ both denote the same event, so that
$$
\Prob\left(|X-\EE[X]|\geq a\right)= \Prob\left((X-\EE[X])^2\geq a^2\right).
$$
Now, $(X-\EE[X])^2$ is a non-negative random variable. So by Markov's inequality
$$
\Prob\left(|X-\EE[X]|\geq a\right)\leq \frac{\EE\left[(X-\EE[X])^2\right]}{a^2}= \frac{\mathrm{Var}(X)}{a^2}.
$$
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Characterization of laws} 
To prove that $X,Y$ have the same law, we have already seen that it is enough to prove that they have the same cdf. We will see two other criteria, here is the first one:
\begin{theo}
If $\EE[\phi(X)]=\EE[\phi(Y)]$ for every bounded and continuous function $\phi$, then $X$ and $Y$ have the same law.
\end{theo}
\begin{example} {\bf ($\mathbf{ -\log (\mathrm{Unif})}$ cont'd)} Let us use this criterion to show that if $X$ is uniform in $[0,1]$ then $-\log(X)\sim\mathcal{E}(1)$.
Let us compute
$$
\EE[\phi(-\log X)]= \int \phi(-\log(x)) \underbrace{\mathds{1}_{[0,1]}(x)}_{\text{ density of }X} dx= \int_{0}^1 \phi(-\log(x)) dx.
$$
Make the change of variables
$t=-\log(x)$, $x=\exp(-t)$, $\frac{dx}{dt}=-\exp(-t)$. We get
$$
\EE[\phi(-\log X)]= \int_{+\infty}^0 \phi(t)(-\exp(-t))dt=\int_0^{+\infty} \phi(t)\exp(-t)dt=\EE[\phi(\mathcal{E})]
$$
where $\mathcal{E}$ follows the exponential distribution. This holds for any bounded and continuous $\phi$, thus $-\log(X)\sim\mathcal{E}$.
\end{example}
Here is another convenient criterion: the \emph{characteristic function}\index{cf|see{characteristic function}} (also called \emph{Fourier transform}).
\begin{defitheo} The \emph{characteristic\index{Fourier transform|see{characteristic function}}\index{characteristic function} function} of a random variable $X$ is the function $\Phi_X(t)$:
$$
\begin{array}{r c c c}
\Phi_X(t): & \bbR & \to & \mathbb{C}\\
           & t    & \mapsto & \EE[e^{itX}],
\end{array}
$$
where $i$ is the complex number $i^2=-1$. If $\Phi_X(t)=\Phi_Y(t)$ for all $t$, then $X$ and $Y$ have the same law.
\end{defitheo}
\emph{(All you need to know about the exponential of a complex number is that the power rule $e^{z+z'}=e^ze^{z'}$ also holds for complex numbers and that $|e^{itx}|=1$.)}
\begin{example} Let $X$ have the exponential distribution with parameter $1$. Then
\begin{align*}
\EE[e^{itX}]&=\int_0^{+\infty} e^{itx}e^{-x}dx\\
            &=\int_0^{+\infty} e^{x(it-1)}dx=\left[\frac{e^{x(it-1)}}{it-1}\right]_0^{+\infty}\\
            &=\frac{1}{it-1} \left(\lim_{x\to +\infty} e^{x(it-1)} -1 \right)\\
            &=\frac{1}{it-1} \left(\lim_{x\to +\infty} \underbrace{e^{xit}}_{\text{ of modulus }1} \times\underbrace{e^{-x}}_{\to 0} -1 \right)=\frac{1}{1-it}.
\end{align*}
\end{example}
Plainly from the definition, we have some interesting properties of the characteristic function.
\begin{prop}
\begin{itemize}
\item $\Phi_X(0)=\EE[e^{0}]=1$.
\item $|\Phi_X(t)|=|\EE[e^{itX}]|\leq \EE[|e^{itX}|]=1$.
\item Assume that $\EE[|X|]<+\infty$, then% one can swap integral and derivative:
$$
\frac{\partial}{\partial t}\Phi_X(t)= \EE\left[\frac{\partial}{\partial t} e^{itX}\right]=\EE[iX e^{itX}].
$$
In particular, $\Phi_X'(0)=i\EE[X]$.%This is a very simple way to derive the expectation of $X$ (once you got its characteric function of course!).
\end{itemize}
\end{prop}

A VERY important example is:
\begin{prop}[Characteristic\index{characteristic function!of the gaussian distribution} function of the gaussian distribution]\label{Prop:CFGauss}\ \\
If $X\sim\mathcal{N}(\mu,\sigma^2)$ then 
$$\Phi_X(t)=\exp(it\mu -t^2\sigma^2/2).
$$
\end{prop}
(You can skip the proof.)
\begin{proof}
First of all, let us prove the claimed result if $X\sim\mathcal{N}(0,1)$, in this case we need to prove that $\Phi_X(t)=\exp(-t^2/2)$.
Let us differentiate $\Phi_X$:
\begin{align*}
\frac{\partial}{\partial t}\Phi_X(t)&= \EE\left[\frac{\partial}{\partial t} e^{itX}\right]\qquad \text{(using Theorem \ref{Theo:DeriveeEsperance})}\\
\Phi_X'(t)&= \int_\bbR \frac{\partial}{\partial t} e^{itx} \frac{\exp(-x^2/2)}{\sqrt{2\pi}}dx\\
&= \int_\bbR ix e^{itx} \frac{\exp(-x^2/2)}{\sqrt{2\pi}}dx\\
&=  \frac{i}{\sqrt{2\pi}} \int_\bbR \underbrace{e^{itx}}_{v} \underbrace{x\exp(-x^2/2)}_{u'} dx
=  \frac{i}{\sqrt{2\pi}}\left( \cancel{[uv]_{-\infty}^{+\infty}} -\int_{-\infty}^{+\infty} uv' \right) \\
&=  \frac{i}{\sqrt{2\pi}}\int_{-\infty}^{+\infty} \exp(-x^2/2) ite^{itx}\\
&= -t\Phi_X(t).
\end{align*}
Thus we have to solve the differential equation $\frac{\Phi_X'(t)}{\Phi_X(t)}=-t$, which is equivalent to
$$
\left(\log(\Phi_X(t))\right)'=-t.
$$
Hence $\log(\Phi_X(t))=-t^2/2+c$, \emph{i.e.}
$\Phi_X(t)=\exp(-t^2/2)e^c$,
but $\Phi_X(0)=1$, and thus $e^c=1$.

We turn to the general case where $X\sim\mathcal{N}(\mu,\sigma^2)$. We can write
$$
X=\mu +\sigma\times Z,
$$
where $Z\sim\mathcal{N}(0,1)$. Now,
$$
\EE\left[e^{itX}\right]=\EE\left[e^{it\mu+it\sigma Z}\right]
=e^{it\mu}\Phi_Z(t\sigma)=\exp\left(it\mu-\frac{t^2\sigma^2}{2}\right).
$$
\end{proof}
For a discrete random variable $X$, it is more usual to deal with the \emph{generating\index{generating function} function} $G_X(z)$ defined by
$$
G_X(z)=\mathbb{E}[z^X]=\sum_{k\geq 0}\mathbb{P}(X=k)z^k.
$$
As for characteristic functions, $X,Y$ have the same law if $G_X(z)=G_Y(z)$ for any $z$.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{$L^p$\index{$L^p$ spaces} spaces}
\begin{defi}
Let $(\Omega,\Trib,\Prob)$ be a probability space, and let $p\geq 1$ be a real number. We denote by $L^p(\Omega,\Trib,\Prob)$ (or just $L^p$ if there is no ambiguity) the set of random variables $X$ such that $\EE[|X|^p]<+\infty$. In this case, we define the \emph{$L^p$ norm} of $X$ as
$$
\norme{X}_p=\EE\left[|X|^p\right]^{1/p}.
$$
\end{defi}
Note that in the definition, $p$ is any real number in $[1,+\infty)$ but in practice we often consider integer values of $p$: $L^1,L^2,\dots$
\begin{example}
\begin{itemize}
\item If $|X|\leq c$, then $\EE[|X|^p]\leq c^p<+\infty$. Thus bounded r.v. are in all $L^p$'s.
\item We saw that if $X$ follows the exponential distribution then $\EE[X^p]=p!<+\infty$. Then $X$ also belongs to all $L^p$'s.
\end{itemize}
\end{example}

\noindent $L^p$ is a \emph{vector space}, meaning that if $X,Y\in L^p$ and $a\in\bbR$, then
\begin{itemize}
\item $aX$ is in $L^p$,
\item $X+Y$ is in $L^p$.
\end{itemize}
It is not so easy to check that $\EE[|X+Y|^p]$ is finite. To prove so, let us observe that for any real numbers $x,y$,
$$
|x+y|^p \leq \left(2\max \set{|x|,|y|}\right)^p\leq 2^p(|x|^p+|y|^p),
$$
and then, by taking expectations of both sides,
$$
\EE[|X+Y|^p] \leq 2^p(\EE[|X|^p]+\EE[|Y|^p]),
$$
which is finite. We admit in these notes that $X\mapsto \norme{X}_p$ is indeed a norm, that is to say:
\begin{itemize}
\item $\norme{aX}_p=|a|\norme{X}_p$,
\item $\norme{X}_p=0$ if and only if $X=0$ $\Prob$-almost surely,
\item {\bf Triangle inequality:} $\norme{X+Y}_p\leq \norme{X}_p+\norme{Y}_p$.
\end{itemize}
A very important property of $L^p$ spaces is that they are included in one another.
\begin{theo}\label{Th:LpInclus}
We always have
$$
\dots L^p \subset L^{p-1} \subset L^{p-2} \subset \dots \subset L^2 \subset L^1.
$$
\end{theo}

\begin{proof}
We will prove that for $p<q$ we have $\norme{X}_p\leq \norme{X}_q$. Then, if $\norme{X}_q$ is finite, so is $\norme{X}_p$.
Since $q/p >1$ the map $x\mapsto x^{q/p}$ is convex on $\bbR_+$ and thus, if we apply Jensen's inequality to the r.v. $|X|^p$ we obtain
\begin{align*}
\EE\left[\left(|X|^p\right)^{q/p}\right]&\geq \left(\EE\left[|X|^p \right]\right)^{q/p}\\
\text{power }1/q\hookrightarrow \qquad \EE\left[|X|^q\right]^{1/q}&\geq \left(\EE\left[|X|^p \right]\right)^{1/p}\\
\norme{X}_q&\geq \norme{X}_p.
\end{align*}
\end{proof}

\begin{defi}[$L^p$\index{$L^p$ convergence} convergence] Let $(X_n)_{n\geq 0}$ be a sequence of random variables, one says that $X_n$ \emph{converges to $X$ in $L^p$} if
$\EE\left[|X_n -X|^p\right]\stackrel{n\to \infty}{\to} 0$.
One writes $X_n \stackrel{L^p}{\to}X$.
\end{defi}
\label{Page:Lp}
Of course this amounts to say that $\norme{X_n-X}_p$ goes to zero. Let us note that if $X_n \stackrel{L^q}{\to}X$ for some $q$, then 
$X_n \stackrel{L^p}{\to}X$ for every $p<q$, since
$$
\norme{X_n-X}_p\leq \underbrace{\norme{X_n-X}_q}_{\to 0}.
$$

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage
\subsection*{The special case of $L^2$}
Let $X,Y$ be in some $L^2(\Omega,\Trib,\Prob)$, it turns out that $XY$ is integrable, due to the following:

\begin{theo}[Cauchy\index{Cauchy-Schwarz's inequality}-Schwarz's inequality]
If $X,Y$ are in $L^2$, then 
$$
|\EE[XY]|\leq \EE[|XY|] \leq \EE[X^2]^{1/2}\EE[Y^2]^{1/2}.
$$
In particular, since the right-hand side is finite, $X\times Y \in L^1$.
\end{theo}
The first inequality is not new, the second one has a nice proof:
{\small \begin{proof}[(sketch of)]
Take some real number $t$, we obviously have
\begin{align*}
0&\leq \EE[(t|X|+|Y|)^2]\\
&= t^2\EE[|X|^2]+2t\EE[|XY|]+\EE[|Y|^2]=:P(t).
\end{align*}
If we see $t$ as the variable, then $P(t)$ is a polynomial of order two whose sign does not change. Thus its discriminant is non-positive:
$$
4\EE[|XY|]^2-4\EE[X^2]\EE[Y^2]\leq 0,
$$
which can be rewritten as $\EE[|XY|] \leq \EE[X^2]^{1/2}\EE[Y^2]^{1/2}$.
\end{proof}}
Since $\EE[XY]$ is finite, we can define the \emph{covariance\index{covariance}} of $X$ and $Y$ by
$$
\mathrm{Cov}(X,Y)=\EE[XY]-\EE[X]\EE[Y]
$$
(note that $\mathrm{Cov}(X,X)$ is just $\mathrm{Var}(X)$).

\medskip

It is often convenient to see the application 
$$
\begin{array}{c c c}
 L^2\times L^2 & \to & \mathbb{R}\\
 (X,Y)         & \mapsto & <X,Y>=\EE[XY]
\end{array}
$$
as a \emph{scalar product}, which means that the following properties hold:
\begin{enumerate}
\item {\bf Symmetry:} $<X,Y>= <Y,X>$
\item {\bf Linearity:} $ <aX+bX',Y>= a<X,Y>+ b<X',Y>$
\item {\bf Positivity:} $ <X,X>\ \geq 0$ 
\end{enumerate}
By analogy with the usual scalar product in geometry, we say that $X,Y$ are {\bf orthogonal} if $<X,Y>=0$. This analogy is very useful, especially when we will define conditional expectation.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage
\section{Pairs of random variables}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Let $X,Y$ be two random variables, the {\bf joint} distribution of $(X,Y)$, denoted by $\Prob_{(X,Y)}$ is the measure on the Borel sets\footnote{The Borel $\sigma$-algebra on $\bbR^2$ is just the $\sigma$-algebra generated by open subsets of $\bbR^2$.} of $\bbR^2$ defined by
$$
\Prob_{(X,Y)}(A)=\Prob\left( (X,Y)\in A\right).
$$

Note that, as for single random variables, the law of $(X,Y)$ is characterized by the {\bf joint cumulative distribution function} defined by
$$
F_{(X,Y)}(s,t)=\Prob\left( \set{X\leq s} \cap \set{Y\leq t} \right).
$$
Yet joint cdfs are not very useful in practice.

\begin{rem}
Of course if you know the joint law of $(X,Y)$ then you can recover the law of each component: for some interval $[a,b]$ we have
$$
\Prob(X\in[a,b]) =\Prob\left( (X,Y)\in [a,b]\times \bbR\right)= \Prob_{(X,Y)}([a,b]\times \bbR).
$$
{\bf The converse is not true:} you can't recover the law of $(X,Y)$ if you only know the laws of $X$ and $Y$.
\end{rem}

When $(X,Y)$ takes its values in a countable set $A$, the law of $(X,Y)$ is fully determined by the probabilities
$$
p_{i,j}:=\Prob\left( X=i,Y=j\right)
$$
and then one can very easily get the law of $X$ just by summing over $j$:
$$
\Prob(X=i)=\Prob\left( \bigcup_j \set{X=i,Y=j}\right)= \sum_j \Prob\left( X=i,Y=j\right)=\sum_j p_{i,j}.
$$

\subsection{Joint densities and the Fubini Theorems}
We first need a few propositions in order to properly define and handle multiple integrals. 

\begin{theo}[The first Fubini\index{Fubini Theorems} Theorem]
Let $f$ be a {\bf non-negative} measurable function
$$
\begin{array}{r c c c}
f: & \mathbb{R}\times \mathbb{R} & \to & \mathbb{R}_+\\
   & (x,y)         & \mapsto & f(x,y).
\end{array}
$$
Then we can integrate $f$ in both ways:
$$
\int_{y\in \mathbb{R}} \left( \int_{x\in \mathbb{R}} f(x,y)dx\right) dy
=\int_{x\in \mathbb{R}} \left( \int_{y\in \mathbb{R}} f(x,y)dy\right) dx.
$$
Then we can without ambiguity denote this quantity by
$$
\int_{\mathbb{R}\times \mathbb{R}} f(x,y)dxdy.
$$
\end{theo}

\begin{theo}[The second Fubini Theorem]
Let $f$ be a measurable function
$$
\begin{array}{r c c c}
f: & \mathbb{R}\times \mathbb{R} & \to & \mathbb{R}\\
   & (x,y)         & \mapsto & f(x,y).
\end{array}
$$
Assume that $\int\hspace{-1mm}\int |f|dxdy$ is finite (you check this with the first Fubini Theorem), then we can integrate $f$ in both ways:
$$
\int_{y\in \mathbb{R}} \left( \int_{x\in \mathbb{R}} f(x,y)dx\right) dy
=\int_{x\in \mathbb{R}} \left( \int_{y\in \mathbb{R}} f(x,y)dy\right) dx.
$$
We denote this quantity by
$$
\int_{\mathbb{R}\times \mathbb{R}} f(x,y)dxdy.
$$
\end{theo}

\begin{defi}[Joint density]
We say that $(X,Y)$ has joint density\index{joint density} $f:\bbR^2\to\bbR_+$ if for any Borel set $A$
$$
\Prob\left( (X,Y)\in A\right)=\int\hspace{-2mm}\int_A f(x,y)dxdy. 
$$
In this case, for every measurable function $\phi$,
\begin{equation}\label{Eq:Transfert}\tag{$\ast$}
\EE\left[\phi(X,Y)\right]=\int\hspace{-2mm}\int \phi(x,y)f(x,y)dxdy. 
\end{equation}
(when this quantity is well-defined).
\end{defi}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection*{$\rhd\ $Marginal densities}
Let $(X,Y)$ have density $f$ and let us compute the density of $X$: for any bounded and continuous function $\phi$ of a single variable $x$, we can apply formula \eqref{Eq:Transfert} just above and get
\begin{align*}
\EE\left[\phi(X)\right]&=\int\hspace{-2mm}\int \phi(x)f(x,y)dxdy\\ 
&=\int  \phi(x) \underbrace{\left(\int f(x,y)dy\right)}_{\text{ density of }X} dx \text{ (by the 2d Fubini Theorem).}
\end{align*}
\begin{prop}
If $(X,Y)$ has density $(x,y)\mapsto f(x,y)$ then $X$ has density $x\mapsto \int f(x,y)dy$, it is called the \emph{marginal\index{marginal density} density} of $X$. Similarly, $Y$ has density $y\mapsto \int f(x,y)dx$.
\end{prop}
\begin{example} Let $(X,Y)$ have density $(x+y)\mathds{1}_{[0,1]\times[0,1]}(x,y)$.
\begin{itemize}
\item Let us first check that this is indeed a density: it is clearly non-negative and by first Fubini's Theorem
\begin{align*}
\int\hspace{-2mm}\int_{[0,1]\times[0,1]} (x+y)dxdy &= \int_{x\in[0,1]} \left(\int_{y\in [0,1]} (x+y)dy\right) dx\\
&= \int_{x\in[0,1]} \left(\int_{y\in [0,1]} xdy +\int_{y\in [0,1]}ydy\right) dx\\
&=  \int_{x\in[0,1]} (x+ 1/2) dx\\
&= 1/2+1/2=1.
\end{align*}
\item Let us compute the density of $X$: by the proposition it is equal to
$$
x\mapsto 
\begin{cases}
&\int_{y\in[0,1]} (x+y)dy = x+1/2\text{ if }x\in[0,1],\\
&0\text{ otherwise.}
\end{cases}
$$
\end{itemize}
\end{example}

\subsubsection*{$\rhd\ $Multivariate characteristic function}
\begin{prop}
As for single-valued random variables, the law of $(X_1,X_2)$ is characterized by its \emph{multivariate characteristic function}\index{characteristic function!multivariate characteristic function} $\Phi_{(X_1,X_2)}(t_1,t_2)$, defined by
$$
\begin{array}{r c c c}
\Phi_{(X_1,X_2)}: & \bbR^2 & \to & \mathbb{C}\\
           & (t_1,t_2)  & \mapsto & \EE\left[\exp(it_1 X_1+it_2 X_2)\right].
\end{array}
$$
\end{prop}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage
\section{Independence}
\newcommand{\ind}{independent}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Definitions}
\begin{defi}[Independence of random variables and $\sigma$-algebras]\ 
\begin{itemize}
\item Let $X_1,X_2,\dots,X_n$ be random variables on the same space $(\Omega,\Trib,\Prob)$. We say that $X_1,\dots,X_n$ are \emph{independent} if for any Borel sets $B_1,\dots,B_n$ we have
$$
\Prob\left( X_1\in B_1, X_2\in B_2, \dots,X_n\in B_n\right)=\prod_{i=1}^n \Prob(X_i\in B_i).
$$
\item Let ${\Trib}_1,{\Trib}_2,\dots,{\Trib}_n$ be sub-$\sigma$-algebras of $\Trib$. We say that ${\Trib}_1,{\Trib}_2,\dots,{\Trib}_n$ are \emph{independent} if for any measurable sets $A_1,\dots,A_n\in\Trib$ we have
$$
\Prob\left( A_1\cap A_2 \cap \dots\cap A_n\right)=\prod_{i=1}^n \Prob(A_i).
$$
\end{itemize}
\end{defi}
We say that a sequence of random variables $X_1,X_2,\dots$ is \emph{i.i.d.} (independent and identically distributed) if, for every $n$, $X_1,\dots,X_n$ are independent and if $X_i$'s all have the same law.

\medskip

\begin{center}\emph{
Independence of events is more subtle. For $A_1,A_2,\dots,A_n$ to be independent, we have to check independence of every sub-family of $A_i$'s: }
\end{center}
\begin{defi}[Independence of events]
Let $A_1,A_2,\dots,A_n$ be measurable events, \emph{i.e.} sets of $\Trib$. We say that $A_1,\dots,A_n$ are \emph{independent} if for every $k\leq n$ and every $1\leq i_1 < i_2 < \dots < i_k \leq n$ we have
$$
\Prob\left( A_{i_1}\cap A_{i_2} \cap \dots\cap A_{i_k}\right)=\Prob(A_{i_1})\times \Prob(A_{i_2})\times\dots \times\Prob(A_{i_k}).
$$
\end{defi}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{``Independence means multiply''}
If $X,Y$ are \ind\ then by definition we have
$$
\Prob( X\leq s,Y\leq t)=\Prob( X\leq s)\Prob(Y\leq t)=F_{X}(s)F_{Y}(t).
$$
\begin{center}\emph{
$\Rightarrow$ The joint cdf is the product of cdf's! }
\end{center}
More generally,
\begin{prop}
If $X_1,X_2,\dots,X_n$ are \ind\ and $\phi_1,\dots, \phi_n$ are measurable functions, then
$$
\EE\left[\phi_1(X_1)\phi_2(X_2)\dots \phi_n(X_n)\right]=\EE[\phi_1(X_1)]\EE[\phi_2(X_2)]\dots \EE[\phi_n(X_n)].
$$
(if both sides are well-defined: if all $\phi_k$'s are non-negative, or bounded, or such that each $\phi_k(X_k)$ is integrable.)
\end{prop}
An interesting consequence is that if $X$ and $Y$ are \ind\ then
$$
\mathrm{Cov}(X,Y)=\EE[XY]-\EE[X]\EE[Y]=\EE[X]\EE[Y]-\EE[X]\EE[Y]=0.
$$
But the converse is not true: there exist r.v. $X,Y$ that are {\bf not} \ind\ for which $\mathrm{Cov}(X,Y)=0$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection*{$\rhd\ $Independence and densities}
\begin{theo} Let $X,Y$ be two random variables.
\begin{itemize}
\item If $X,Y$ have densities $f_X$ and $f_Y$, and if $X,Y$ are \ind, then
$$
(X,Y) \text{ has density } f_{(X,Y)}(x,y)= f_X(x)f_Y(y).
$$
\item Conversely, if $(X,Y)$ has a density which can be written as a product $g_1(x)\times g_2(y)$ then $X,Y$ are \ind.
\end{itemize}
\end{theo}
\begin{center}
\includegraphics[width=7cm]{Figures/Gaussienne2D.jpg}\\
\emph{The density of $(X,Y)$ where $X,Y$ are independent $\mathcal{N}(0,1)$.}
\end{center}
\medskip

\begin{example}
Assume $(X,Y)$ has density $6x^2y\mathds{1}_{(x,y) \in [0,1]^2}$ (exercise: check that this is a density). Then the theorem says that $X,Y$ are independent since one can write
$$
6x^2y\mathds{1}_{[0,1]\times [0,1]}(x,y)=6x^2\mathds{1}_{x\in[0,1]} \times y\mathds{1}_{y\in [0,1]}.
$$
Though, we must take care to constants if we look for marginal densities: the density of $X$ is
$$
\int_{y=0}^1 6x^2ydy =6x^2 \int_{y=0}^1 ydy=3x^2\text{ for }x\in[0,1].
$$
Finally,
$$
6x^2y\mathds{1}_{x,y \in [0,1]} = \underbrace{3x^2\mathds{1}_{x\in[0,1]}}_{\text{ density of }X} \times \underbrace{2y\mathds{1}_{y\in [0,1]}}_{\text{ density of }Y}.
$$
\end{example}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Sums of independent random variables}
Let $X,Y$ be random variables, what can we say about $X+Y$ ? First, by linearity of expectation
$$
\EE[X+Y]=\EE[X]+\EE[Y].
$$
Now, assume that $X,Y$ are \ind. Then
\begin{align*}
\mathrm{Var}(X+Y)&=\EE[(X+Y)^2]-\EE[X+Y]^2\\
                 &=\EE[X^2+Y^2+2XY]-\EE[X]^2-\EE[Y]^2-2\EE[X]\EE[Y]=\mathrm{Var}(X)+\mathrm{Var}(Y).
\end{align*}
More generally, if $X_1,\dots,X_n$ are \ind, then expectations and variances add up:
\begin{align*}
\EE[X_1+\dots+X_n]&=\EE[X_1]+\dots +\EE[X_n],\\
\mathrm{Var}(X_1+\dots+X_n)&=\mathrm{Var}(X_1)+\dots +\mathrm{Var}(X_n) \ \ \text{(here }X_k\text{'s need to be {\bf independent}!)}
\end{align*}

\medskip

\begin{center}\emph{
More precisely, what can we say about the distribution of the sum of independent random variables?}
\end{center}
Assume first that $X,Y$ have densities $f,g$. We want to compute the density (if any) of $X+Y$. Take a bounded and continuous function $\phi$ and compute
\begin{align*}
\EE[\phi(X+Y)]&=\int\hspace{-1mm}\int \phi(x+y)f(x)g(y)dxdy\\
		  &=\int\hspace{-1mm}\int \phi(u)f(x)g(u-x)dxdu\text{ (by ch. of variables }u=x+y,\ du/dy=1)\\
                  &=\int_{u} \phi(u) \underbrace{\left(\int_{x} f(x)g(u-x)dx\right)}_{\text{ density of }X+Y} du.
\end{align*}
Then we have a formula for the density of $X+Y$. It is called the \emph{convolution}\index{convolution} of $f$ and $g$:
\begin{defitheo}[Convolution of two densities]
Let $X,Y$ be two independent random variables with densities $f$ and $g$. Then $X+Y$ has a density, it is denoted $f\star g$ and defined by 
$$
f\star g(u)=\int_x f(x)g(u-x)dx.
$$
The function $u\mapsto f\star g(u)$ is said to be the \emph{convolution} of $f$ and $g$.
\end{defitheo}
\begin{example} Let $X,Y$ be i.i.d., with the exponential distribution, \emph{i.e.} $f(x)=e^{-x}\mathds{1}_{x\geq 0}$ and $g(y)=e^{-y}\mathds{1}_{y\geq 0}$. The random variable $X+Y$ also takes its values in $[0,+\infty)$ and by the previous computation, the density of $X+Y$ is given by
\begin{align*}
\text{(for all }u\geq 0),\qquad \int_{x=0}^\infty f(x)g(u-x)dx&= \int_{x=0}^\infty e^{-x}e^{-(u-x)}\mathds{1}_{u-x \geq 0}\ dx\\
&=e^{-u} \int_{x=0}^\infty \mathds{1}_{u-x \geq 0}\ dx=e^{-u} \int_{x=0}^\infty \mathds{1}_{x \leq u}\ dx\\
&=e^{-u} \int_{x=0}^u dx=ue^{-u}.\\
\end{align*}
Then $X+Y$ has density $ue^{-u}\mathds{1}_{u\geq 0}$ (you can check that it is a density).
\end{example}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection*{$\rhd\ $Sums of random variables and characteristic functions}
Another very efficient tool for the sum of r.v. is the use of characteristic functions (in fact this is the very reason for which characteristic functions are introduced in this course). Indeed, we have
$$
\Phi_{X+Y}(t)=\EE[e^{it(X+Y)}]=\EE[e^{itX}e^{itY}]\stackrel{\text{by ind.}}{=}\EE[e^{itX}]\EE[e^{itY}]=\Phi_X(t)\Phi_Y(t),
$$
the CF of a sum of \ind\ random variables is the product of CFs!
\begin{example}
Let  $X,Y$ be \ind\ $\mathcal{N}(0,1)$. What is the law of $X+Y$ ?
$$
\Phi_{X+Y}(t)=\Phi_X(t)\Phi_Y(t)=e^{-t^2/2}e^{-t^2/2}=e^{-t^2\times 2/2},
$$
and we recognize the CF of a $\mathcal{N}(0,2)$ (see Proposition \ref{Prop:CFGauss}). So, under independence, we have
$$
\mathcal{N}(0,1)+\mathcal{N}(0,1)\sim \mathcal{N}(0,2).
$$
\end{example}
More generally, we can prove that:
\begin{prop}[Sum of two independent gaussian r.v.]\label{Prop:SumGaussian}
If $X_1\sim\mathcal{N}(\mu_1,\sigma_1^2)$, $X_2\sim\mathcal{N}(\mu_2,\sigma_2^2)$ and if $X_1,X_2$ are \ind, then $X_1+X_2$ is also a gaussian random variable. More precisely,
$$
X_1+X_2\sim\mathcal{N}(\mu_1+\mu_2,\sigma_1^2+\sigma_2^2).
$$
\end{prop}

\medskip

\noindent In the same spirit, you can use CF's to prove the following:
\begin{exo} Let $X,Y$ be two independent random variables, both having the Binomial distribution $\mathrm{Bin}(n,p)$ with parameters $(n,p)$.
Check that $\Phi_X(t)=(1-p+pe^{it})^n$ and prove that $X+Y\sim \mathrm{Bin}(2n,p)$.
\end{exo}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{The Borel-Cantelli\index{Borel-Cantelli Lemmas} lemmas}
Given events $A_1,A_2,\dots $, a main concern is often ``how many of the $A_n$'s occur?''. 
The first Borel-Cantelli Lemma says that if $\Prob(A_n)$ is small enough, then $A_n$ cannot occur infinitely often.
Recall that
$$
\limsup_{n\to \infty} A_n=\text{ ``$A_n$ occurs infinitely often'' } = \bigcap_{p\geq 1} \bigcup_{n\geq p} A_n.
$$
\begin{theo}[First Borel-Cantelli Lemma]
If $\sum_{n\geq 1} \Prob(A_n) <+\infty$ then $A_n$ cannot occur infinitely often:
$$
\Prob\left(\limsup_{n\to\infty} A_n\right) =0.
$$
\end{theo}
On the contrary, if $\Prob(A_n)$ is not too small, then we can prove that $A_n$ occurs infinitely often. This is the second Borel-Cantelli Lemma, but we need the extra assumption that $A_n$'s are independent:
\begin{theo}[Second Borel-Cantelli Lemma]
If $\sum_{n\geq 1} \Prob(A_n) =+\infty$ and if furthermore  $A_1,A_2,\dots $ are independent, then $A_n$ occurs infinitely often:
$$
\Prob\left(\limsup_{n\to\infty} A_n\right) =1.
$$
\end{theo}
You can skip the proofs of Borel-Cantelli lemmas, more important is the application just below, which helps to understand how BC's lemmas work in practice.
{\small
\begin{proof}[of the first Borel-Cantelli lemma]
We consider the random variable $\sum_{n\geq 1} \mathds{1}_{A_n}$ that counts the number of $A_n$'s that occur. By positivity one can swap $\sum$ and $\EE$ and then
$$
\EE\left[ \sum_{n\geq 1} \mathds{1}_{A_n} \right] = \sum_{n\geq 1} \EE\left[ \mathds{1}_{A_n} \right]=\sum_{n\geq 1} \Prob(A_n)<+\infty.
$$
Then $\sum_{n\geq 1} \mathds{1}_{A_n}$ has finite expectation, thus it is finite with probability $1$: $A_n$ occurs finitely many times.
\end{proof}
\begin{proof}[of the second Borel-Cantelli lemma]
Fix $p\geq 1$,
\begin{align*}
\Prob\left(A_n\text{ doesn't occur for }n\geq p \right)&\stackrel{\phantom{\text{by indep.}}}{=}\Prob\left(\overline{A_p}\cap \overline{A_{p+1}} \cap \overline{A_{p+2}} \cap\dots \right)\\
&\stackrel{\text{by indep.}}{=} \prod_{n\geq p} \Prob(\overline{A_n})=\prod_{n\geq p} \left(1-\Prob(A_n)\right).
\end{align*}
Now, we use the fact that $1-x\leq e^{-x}$ for any real $x$:
$$
\Prob\left(A_n\text{ doesn't occur for }n\geq p \right)\leq \prod_{n\geq p} \exp\left(-\Prob(A_n)\right)
= \exp\left( -\sum_{n\geq p} \Prob(A_n)\right)=\exp(-\infty)=0.
$$
Then
\begin{align*}
\Prob\left(A_n\text{ occurs finitely many times }\right)&=\Prob\left(\bigcup_{p\geq 1} \set{A_n\text{ doesn't occur for }n\geq p} \right)\\
&\leq \sum_{p\geq 1} \Prob\left(A_n\text{ doesn't occur for }n\geq p \right)=\sum_{p\geq 1} 0=0
\end{align*}
(we used item 3. in Proposition \ref{Prop:Mesures}). This proves that $\Prob\left(\limsup_{n\to\infty} A_n\right) =1$.
\end{proof}}

\subsection*{An application of Borel-Cantelli's lemmas: extreme values}
Let $X_1,X_2,\dots$ be i.i.d. exponential random variables with parameter $1$. We have for each $n$
$$
\Prob(X_n\geq t)=\int_t^{+\infty}e^{-u}du=e^{-t}.
$$
What can we say about very large values of the sequence $(X_n)$ ? First of all, it should be obvious that this infinite sequence is not bounded. To prove so, take a huge number, $10^{100}$ say. We have, for each $n$, $\Prob(X_n\geq 10^{100})=e^{-10^{100}}>0$. Then
$$
\sum_{n\geq 1} \Prob(X_n\geq 10^{100}) =\sum_{n\geq 1} e^{-10^{100}}=+\infty.
$$
Then, applying Borel-Cantelli n.2 with events $A_n=\set{X_n\geq 10^{100}}$ (which are independent) shows that $A_n$ occurs infinitely often: $X_n\geq 10^{100}$ for infinitely many $n$'s.

Now we would like to be more precise: for a fixed $n$, how large are extreme values among $X_1,\dots,X_n$ ? We can prove that they are of order $\log(n)$. First,
$$
\sum_{n\geq 1} \Prob(X_n\geq \log(n)) =\sum_{n\geq 1} e^{-\log(n)}=\sum_{n\geq 1} \frac{1}{n}=+\infty.
$$
Then, again by Borel-Cantelli n.2, we have $\set{X_n\geq \log(n)}$ for infinitely many $n$'s. On the other hand,
$$
\sum_{n\geq 1} \Prob(X_n\geq 2\log(n)) =\sum_{n\geq 1} e^{-2\log(n)}=\sum_{n\geq 1} \frac{1}{n^2}<+\infty.
$$
Hence, by Borel-Cantelli n.1, $\set{X_n\geq 2\log(n)}$ occurs only finitely many times. Here is a picture that sums up the situation:
\begin{center}
\includegraphics[width=105mm]{Figures/ExtremeValues.pdf}
\end{center}
We have precisely proved that, almost surely,
$$
1\leq \limsup_{n\to +\infty} \frac{X_n}{\log(n)} \leq 2.
$$

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Change of variables}
Let $(X,Y)$ be a pair of independent random variables with densities $f_X$ and $f_Y$ and assume that we want to compute the joint density of $\left(u(X,Y),v(X,Y)\right)$ for some nice functions $u,v$. To do this we have to compute
$$
\EE\left[\phi\left(u(X,Y),v(X,Y)\right)\right]=\int\hspace{-3mm}\int \phi\left(u(x,y),v(x,y)\right) f_X(x)f_Y(y)\mathrm{dxdy}
$$ 
for any bounded and continuous $\phi$.\\
The general method to do that is to use a two-dimensional change of variables, we won't make the detailed theory but rather run an example.
\begin{example}
Let $X,Y$ be two independent $\mathcal{N}(0,1)$, we would like to compute the joint density of $(X+Y,X-Y)$. We set $U=X+Y$ and $V=X-Y$,
$$
\EE\left[\phi(U,V)\right]=\int\hspace{-3mm}\int \phi\left(x+y,x-y\right) \frac{e^{-x^2/2}}{\sqrt{2\pi}}\frac{e^{-y^2/2}}{\sqrt{2\pi}}\mathrm{dxdy}.
$$
In the right-hand side we make the change of variables
$$
\begin{cases}
u=x+y,\\
v=x-y.
\end{cases}
\Leftrightarrow
\begin{cases}
x=(u+v)/2,\\
y=(u-v)/2.
\end{cases}
$$
If $(x,y)$ runs in all $\bbR^2$ then so does $(u,v)$ so the domain is still $\bbR^2$. We have to make the change $\mathrm{dxdy}\leftrightarrow \mathrm{dudv}$, we write the \emph{Jacobian matrix}
$$
\mathrm{Jac}(x,y)=
{\Large
\begin{pmatrix}
\frac{\partial x}{\partial u} & \frac{\partial x}{\partial v}\\
\frac{\partial y}{\partial u} & \frac{\partial y}{\partial v}
\end{pmatrix}
=
\begin{pmatrix}
\frac{\partial }{\partial u}\frac{u+v}{2} & \frac{\partial }{\partial v}\frac{u+v}{2}\\
\frac{\partial }{\partial u}\frac{u-v}{2} & \frac{\partial }{\partial v}\frac{u-v}{2}
\end{pmatrix}
}
=
\begin{pmatrix}
1/2 & 1/2\\
1/2 & -1/2
\end{pmatrix}.
$$
Now you have to remember that
$$
\frac{\mathrm{dxdy}}{ \mathrm{dudv}}=\left|\mathrm{det}(\mathrm{Jac}(x,y))\right|=\left|\frac{1}{2}\times (-\frac{1}{2})-\frac{1}{2}\times\frac{1}{2}\right|=|-1/2|=1/2.
$$

\end{example}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage
\subsection{Gaussian random variables and gaussian vectors}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Recall that if $X\sim \mathcal{N}(\mu,\sigma^2)$ then  $\EE[X]=\mu$, $\mathrm{Var}(X)=\sigma^2$ and
$$
\Prob(X\in A)=\int_A \frac{1}{\sigma\sqrt{2\pi}} \exp\left(-\frac{(x-\mu)^2}{2\sigma^2}\right)dx
\qquad \text{ and } \qquad
\Phi_X(t)=\exp\left(it\mu-\frac{t^2\sigma^2}{2}\right).
$$
We are now interested in \emph{multivariate} gaussian random variables.
\begin{defi}[Gaussian vectors] A random vector $\mathbf{X}=(X_1,\dots,X_d)$ is a \emph{gaussian\index{gaussian vector!definition} vector} (one also says that $\mathbf{X}$ follows the \emph{multivariate gaussian distribution}) if for all real numbers $t_1,t_2,\dots ,t_d$ the linear combination
$$
t_1X_1 +t_2X_2 +\dots+t_dX_d
$$
is a gaussian random variable (here, by convention, we say that the constant random variable $c$ follows the gaussian distribution $\mathcal{N}(c,0)$).
\end{defi}
\begin{rem}
\begin{enumerate}
\item If $X_1,\dots,X_d$ are independent with $X_i\sim\mathcal{N}(\mu_i,\sigma^2_i)$, then Proposition \ref{Prop:SumGaussian} tells that the linear combination
$$
t_1X_1 +t_2X_2 +\dots+t_dX_d\sim\mathcal{N}\left(\ t_1\mu_1+\dots +t_d\mu_d\ ,\ t_1^2\sigma_1^2+ \dots +t_d^2\sigma_d^2\ \right)
$$
and thus is gaussian: {\bf a vector made of independent gaussian r.v. is a gaussian vector}.
\item By taking $t_i=1$ and $t_j=0$ for $j\neq i$, the definition tells that $X_i$ is gaussian: {\bf every component of a gaussian vector is gaussian}.
\end{enumerate}
\end{rem}
The following exercise shows that the converse is not true: it is not enough that each component of $\mathbf{X}$ is gaussian for $\mathbf{X}$ to be a gaussian vector:
\begin{exo}
Let $X\sim\mathcal{N}(0,1)$ and let $\eps=\pm 1$ with probability $1/2$ independently from $X$. 
\begin{enumerate}
\item Prove that $\eps X$ is a gaussian random variable (for instance with the characteristic function).
\item Compute $\Prob(X-\eps X=0)$ and explain why this proves that $(X,\eps X)$ is not a gaussian vector.
\end{enumerate}
\end{exo}

\medskip

\begin{defi}[Parameters of a gaussian vector]
Let $\mathbf{X}$ be a gaussian vector. The \emph{mean vector} $\mu$ of $\mathbf{X}$ is the column vector of expectations:
$$
\mu=\begin{pmatrix}
\EE[X_1]\\
\EE[X_2]\\
\vdots\\
\EE[X_d]
\end{pmatrix}
$$
and the \emph{covariance\index{covariance matrix} matrix} $C$ of $\mathbf{X}$ is the $d\times d$ matrix where entry $C_{i,j}=  \mathrm{Cov}(X_i,X_j)$:
$$
C=
\bordermatrix{
  &  & j & \cr
  &  & \vdots & \cr
%  &  &  & \cr
i\ \   &  \dots  & \mathrm{Cov}(X_i,X_j) & \dots \cr
  &  & \vdots &
}
$$
\end{defi}
\begin{example} If $\mathbf{X}$ is made of i.i.d. $X_1,\dots,X_d$ with $X_i\sim\mathcal{N}(0,1)$, then $\mathrm{Cov}(X_i,X_i)=\mathrm{Var}(X_i)=1$, and  $\mathrm{Cov}(X_i,X_j)=0$ for $i\neq j$ (by independence). Then $C$ is the identity matrix:
$$
C=
\begin{pmatrix}
1 & 0   & \cdots & 0 \cr
0 & 1   &   &  \vdots \cr
\vdots  &  & \ddots  & \cr
0  & \cdots & & 1
\end{pmatrix}
$$

\end{example}

Since $\mathrm{Cov}(X_i,X_j)=\mathrm{Cov}(X_j,X_i)$, the matrix $C$ is always symmetric. It is less obvious, but true, that $C$ is also positive-semi-definite, which means that for every vector $\mathbf{t}=(t_1,\dots,t_d)\in\bbR^d$ one has the inequality
$\mathbf{t}'C \mathbf{t} \geq 0$,
where $\mathbf{t}'$ is the \emph{transpose} of $\mathbf{t}$. Recall that
$$
\mathbf{t}'C \mathbf{t}=\mathbf{t}'\times\left(C \mathbf{t}\right)=
\mathbf{t}'\times
\begin{pmatrix}
\sum_j C_{1,j}t_j\\
\sum_j C_{2,j}t_j\\
\vdots\\
\sum_j C_{d,j}t_j
\end{pmatrix}
=
\sum_{i,j} t_iC_{i,j}t_j.
$$
\begin{defitheo}
Let $\mathbf{X}$ be a gaussian vector with mean vector $\mu$ and covariance matrix $C$. Then the \emph{multivariate characteristic function} of $\mathbf{X}$ 
is defined by
$$
\begin{array}{r c c c}
\Phi_{\mathbf{X}}: & \bbR^d & \to & \mathbb{C}\\
           & \mathbf{t}=(t_1,\dots,t_d)  & \mapsto & \EE\left[\exp(it_1 X_1+\dots +it_d X_d)\right].
\end{array}
$$
and for all $\mathbf{t}$ we have the formula
\begin{equation}\label{Eq:FourierGaussienne}\tag{$\#$}
\Phi_{\mathbf{X}}(\mathbf{t})=\exp\left(i\mathbf{t}'\mu -\frac{\mathbf{t}'C\mathbf{t}}{2} \right).
\end{equation}
\end{defitheo}
A consequence of this expression is that if $\mathbf{X}$ and $\mathbf{Y}$ are two gaussian vectors with the same mean vector and the same covariance matrix, then $\Phi_{\mathbf{X}}(\mathbf{t}) =\Phi_{\mathbf{Y}}(\mathbf{t})$ for all $\mathbf{t}$ and then $\mathbf{X}$ and $\mathbf{Y}$ have the same law.
\begin{center}\emph{
$\Rightarrow$ The distribution of a gaussian vector is fully characterized by $\mu$ and $C$ !}
\end{center}
\begin{proof}[of formula \eqref{Eq:FourierGaussienne}]
We fix a vector $\mathbf{t}=(t_1,\dots,t_d)$ in $\bbR^d$. Since $\mathbf{X}$ is a gaussian vector, the random variable $Y$ defined by the linear combination $Y=t_1 X_1+\dots +t_d X_d$ is a gaussian random variable. We have 
$$
\Phi_{\mathbf{X}}(\mathbf{t}) =\EE\left[\exp\left(i\left( t_1 X_1+\dots +t_d X_d\right) \right)\right]
=\EE\left[\exp (i\times 1\times Y) \right]=\Phi_Y(1)
$$
so it suffices to compute the characteristic function of $Y$. To do so, it is enough (since $Y$ is gaussian) to compute $\EE[Y]$ and $\mathrm{Var}(Y)$. First, by linearity of expectation,
$$
\EE[Y]=\EE\left[ t_1 X_1+\dots +t_d X_d \right]=t_1 \EE[X_1]+\dots + t_d \EE[X_d]=\mathbf{t}'\mu.
$$
Let us now compute $\EE[Y^2]$:
\begin{align*}
\EE[Y^2]&=\EE\left[ (t_1 X_1+\dots +t_d X_d)^2 \right]=\sum_{i,j} t_it_j \EE[X_iX_j]\\
\EE[Y]^2&=\left( \sum_{i} t_i\EE[X_i] \right)^2= \sum_{i,j} t_it_j \EE[X_i]\EE[X_j].
\end{align*}
Hence
$$
\mathrm{Var}(Y)=\sum_{i,j} t_it_j \left(\EE[X_iX_j] -\EE[X_i]\EE[X_j]\right)=\sum_{i,j} t_it_j C_{i,j}=\mathbf{t}'C\mathbf{t}.
$$
The proof is finished since by Proposition \ref{Prop:CFGauss}
$$
\Phi_Y(1)=\exp\left(i\times 1 \times \EE[Y] -1^2\times \frac{\mathrm{Var}(Y)}{2} \right)=\exp\left(i\mathbf{t}'\mu -\frac{\mathbf{t}'C\mathbf{t}}{2} \right).
$$
\end{proof}
In these notes, we will admit (and won't use) the following formula:

\begin{prop}[Density\index{gaussian vector!density} of a gaussian vector]
Let $\mathbf{X}$ be a gaussian vector with mean vector $\mu$ and covariance matrix $C$.\\
If $C$ is invertible (\emph{i.e.} there exists $C^{-1}$ such that $C^{-1}\times C=\mathrm{Id}$, this implies $\mathrm{det}(C)\neq 0$) then $\mathbf{X}$ has a density on $\bbR^d$: for every Borel set $A$ of $\bbR^d$,
$$
\Prob(\mathbf{X}\in A)=\int\hspace{-2mm}\int\hspace{-1mm}\dots \hspace{-1mm}\int_A \hspace{2mm}\frac{1}{\sqrt{(2\pi)^d\mathrm{det}(C)}}\exp\left(-\tfrac12 (\mathbf{x}-\mu)'C^{-1}(\mathbf{x}-\mu) \right)dx_1dx_2\dots dx_d,
$$
where $\mathbf{x}=(x_1,\dots,x_d)$.
\end{prop}
\begin{rem}
\begin{itemize}
\item When $d=1$ the matrix $C$ is simply $\begin{pmatrix}\mathrm{Var}(X)\end{pmatrix}$ and obviously $\mathrm{det}(C)=\mathrm{Var}(X)$. Thus you recognize the density of the (one-dimensional) gaussian distribution.
\item When $\mathrm{det}(C)= 0$ then one can prove that $\mathbf{X}$ lies in a strict sub-space of $\bbR^d$ (for which Lebesgue measure is zero) and then $\mathbf{X}$ has no density.
\end{itemize}
\end{rem}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage
\section{Convergences of random variables}\label{Chap:Convergences}
\subsection{$\neq$ kinds of convergences of random variables} 
Let $X_1,X_2,\dots$ be random variables defined on the same probability space $(\Omega,\Trib,\Prob)$ and $X$ another random variable defined on $\Omega$. What does it mean that $(X_n)_{n\geq 1}$ converges to $X$?
\begin{defi}
\begin{itemize}
\item The sequence $(X_n)_{n\geq 1}$ converges to $X$ \emph{in probability} if for all real number $\eps>0$ 
$$
\Prob\left(|X_n-X| >\eps\right) \stackrel{n\to +\infty}{\to} 0.
$$
One writes $(X_n) \stackrel{\text{prob.}}{\to} X$.
\item The sequence $(X_n)_{n\geq 1}$ converges to $X$ \emph{in $L^p$} if
$$
\EE\left[\ \left|X_n-X\right|^p\ \right] \stackrel{n\to +\infty}{\to} 0.
$$
One writes $(X_n) \stackrel{L^p}{\to} X$.
\item The sequence $(X_n)_{n\geq 1}$ converges to $X$ \emph{almost surely} if
$$
\Prob\left(X_n\stackrel{n\to\infty}{\longrightarrow} X\right)=\Prob\left(\omega\text{ such that }X_n(\omega)\stackrel{n\to\infty}{\longrightarrow} X(\omega) \right) =1.
$$
One writes $(X_n) \stackrel{\text{a.s.}}{\to} X$.
\end{itemize}
\end{defi}

\noindent Here is an example that shows that these three kinds of convergence are NOT equivalent:\\
\begin{example} $\left(\stackrel{\text{prob.}}{\to}\ \neq\ \stackrel{L^p}{\to}\ \neq\ \stackrel{\text{a.s.}}{\to}\right)$
Take $X_1,X_2,\dots$ be a sequence of independent random variables such that
$$
X_n=\begin{cases}
\sqrt{n} &\text{ with probability }\tfrac1n,\\
0 &\text{ with probability }1-\tfrac1n.
\end{cases}
$$
When $n$ goes large, $X_n$ is more and more likely to be zero, so we expect $(X_n)_{n\geq 1}$ to converge (at least in some sense) to zero.\\
Let us first check that $(X_n) \stackrel{\text{prob.}}{\to} 0$: fix a small $\eps>0$, we have
$$
\Prob\left(|X_n-0| >\eps\right) = \Prob\left(X_n =\sqrt{n} \right) =1/n \stackrel{n\to +\infty}{\to} 0.
$$
Let us now consider the convergence to $0$ in $L^p$. 
$$
\EE\left[\ \left|X_n-0\right|^p\ \right]= \EE[(X_n)^p] = 0\times (1-\tfrac1n)+(\sqrt{n})^p\times\tfrac1n =n^{p/2-1}.
$$
This goes to zero for $p<2$: $(X_n) \stackrel{L^p}{\to} 0$ for all $1\leq p<2$.\\
Almost sure convergence is more delicate, we need the second Borel-Cantelli Lemma. $X_n$'s are independent and we have
$$
\sum_{n\geq 1} \Prob(X_n=\sqrt{n})= \sum_{n\geq 1} \frac1n =+\infty.
$$
Then, with probability one, "$X_n=\sqrt{n}$" is true for infinitely many $n$'s. In particular the sequence $(X_n)_{n\geq 1}$ is not bounded and does not converge to zero.
\end{example}

Convergence in probability is in fact the "weakest" of all:
\begin{prop}
\begin{itemize}
\item If $(X_n) \stackrel{L^p}{\to} X$, then  $(X_n) \stackrel{\text{prob.}}{\to} X$.
\item If $(X_n) \stackrel{\text{a.s.}}{\to} X$, then  $(X_n) \stackrel{\text{prob.}}{\to} X$.
\end{itemize}
\end{prop}
Recall also that we saw page \pageref{Page:Lp} that if $X_n \stackrel{L^q}{\to}X$ for some $q$, then 
$X_n \stackrel{L^p}{\to}X$ for every $p<q$ since
$$
\norme{X_n-X}_p\leq \underbrace{\norme{X_n-X}_q}_{\to 0}.
$$
\begin{proof}[of $\left(\stackrel{L^p}{\to}\ \Rightarrow\ \stackrel{\text{prob.}}{\to}\right)$]
Assume that $(X_n) \stackrel{L^p}{\to} X$ and fix $\eps >0$,
\begin{align*}
\Prob\left(|X_n-X| >\eps\right)&= \Prob\left(|X_n-X|^p >\eps^p\right), & & \text{ (this is the same event)}\\
&\leq \frac{\EE[\ |X_n-X|^p\ ]}{\eps^p}, & & \text{ (by Markov's inequality)}
\end{align*} 
which goes to zero by assumption ($\eps$ is fixed and $n\to +\infty$).
\end{proof}
\begin{proof}[of $\left(\stackrel{\text{a.s.}}{\to}\ \Rightarrow\ \stackrel{\text{prob.}}{\to}\right)$]
Assume that for almost every $\omega$, $X_n(\omega)\to X(\omega)$.  Let us write
$$
\Prob\left(|X_n-X| >\eps\right)=\Prob\left(\omega\text{ s.t. }|X_n(\omega)-X(\omega)| >\eps\right)= \EE\left[\mathds{1}_{|X_n-X| >\eps}(\omega)\right].
$$
Now, $\mathds{1}_{|X_n-X| >\eps}(\omega)$ is bounded by one and, by assumption, goes to zero for almost every $\omega$. Then, by dominated convergence, the latter  expectation goes to zero and the proposition is proved.
\end{proof}
In short:
\begin{center}
\includegraphics[width=14cm]{Figures/Convergences.pdf}
\end{center}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Laws of Large Numbers}
Basically, the \emph{Law\index{LLN (Law of Large Numbers)} of Large Numbers} (LLN) says that the average of i.i.d. random variables $X_1,X_2,\dots ,X_n$ gets closer and closer to $\EE[X]$.
We need a precise statement.
\begin{theo}[The weak Law of Large Numbers]
Let $X_1,X_2,\dots$ be a sequence of i.i.d. integrable random variables with expectation $\mu=\EE[X_1]$ and such that $\mathrm{Var}(X_1)<+\infty$,
$$
\frac{X_1+X_2+\dots +X_n}{n} \stackrel{n\to +\infty}{\to} \mu,
$$
where the convergence holds in $L^2$, and thus also in probability.
\end{theo}
The "weak" refers to the fact that the convergence only holds in $L^2$ and in probability, we will see just below a "strong" LLN.
\begin{proof}
The proof is fairly simple, and relies on the fact that the variance of independent r.v. is linear.
Set $S_n=X_1+X_2+\dots +X_n$ and recall that
$$
\EE\left[S_n/n \right]=\frac{\EE[X_1]+\dots +\EE[X_n]}{n}=\frac{n\mu}{n}=\mu.
$$
Now,
\begin{align*}
\EE\left[\left(\frac{S_n}{n}- \mu\right)^2\right] &= \EE\left[\left(\frac{S_n}{n}- \EE\left[\frac{S_n}{n}\right] \right)^2\right] & & \\
&= \mathrm{Var}\left(\frac{S_n}{n}\right)= \frac{1}{n^2}\mathrm{Var}(S_n) & &\text{ (recall formula \eqref{Eq:PropVariance} page \pageref{Eq:PropVariance})}\\
&=\frac{1}{n^2}\left(\mathrm{Var}(X_1)+\mathrm{Var}(X_2)+\dots +\mathrm{Var}(X_n) \right) & &  \text{ (by independence)}\\
&=\frac{1}{n^2}n\mathrm{Var}(X_1)=\frac{1}{n}\mathrm{Var}(X_1)\to 0, & &
\end{align*}
and the $L^2$ convergence is proved.
\end{proof}
In order to understand how to use the weak LLN, you might try to convince yourself that:
\begin{exo}
We toss a fair coin infinitely many times, and denote by $H_n$ the number of \emph{Heads} seen in the first $n$ tosses.
Then
$$
\Prob\left(0.49n\leq H_n \leq 0.51n \right)\to 1
$$
(you also can prove this "by hand" with Chebyshev's inequality). 
\end{exo}
\begin{theo}[The strong Law of Large Numbers]
Let $X_1,X_2,\dots$ be a sequence of i.i.d. integrable random variables with expectation $\mu=\EE[X_1]$,
$$
\frac{X_1+X_2+\dots +X_n}{n} \stackrel{\text{a.s.}}{\to} \mu.
$$
\end{theo}
\noindent (Note that we do not require $X_n$'s to have a finite variance.)\\
In fact, the strong LLN is a much deeper result than the weak one, and the proof is very difficult. Here is an easier proof, if we assume that $\EE[X_1^4]<+\infty$ (which is often the case in practice). You can skip the proof.
{\small
\begin{proof}[of the strong LLN when {$\EE[X_1^4]<+\infty$}]\ \\
We first do the case $\mu=0$, let us compute
$$
\EE\left[\left(\frac{S_n}{n}\right)^4\right]=\frac{1}{n^4} \EE\left[(X_1+X_2+\dots +X_n)^4\right].
$$
When we expand the power $4$, five types of terms arise:
$$
\text{(i) } X_i^4\qquad \text{(ii) }X_i^3X_j\qquad \text{(iii) }X_i^2X_j^2 \qquad \text{(iv) }X_i^2X_jX_k\qquad \text{(v) }X_iX_jX_kX_\ell.
$$
Now, by independence, we have for instance
$$
\EE[X_i^3X_j]=\EE[X_i^3]\EE[X_j]=\EE[X_i^3]\times 0=0.
$$
and then terms of type (ii) vanish after taking expectation. The same goes for types (iv) and (v). Then all that remains after taking expectation is
\begin{align*}
\EE\left[\left(\frac{S_n}{n}\right)^4\right]
&=\frac{1}{n^4} \left( \sum_{i\leq n} \EE[X_i^4] + \sum_{\substack{ i,j\leq n \\\text{such that } i\neq j}} \EE[X_i^2]\EE[X_j^2] \right)\\
&=\frac{1}{n^4} \left(  n\EE[X_1^4] + \underbrace{n(n-1)}_{\#\text{ of pairs }(i,j)} \times \underbrace{\binom{4}{2}}_{\#\text{ of each $X_i^2X_j^2$}} \times\ \ \EE[X_1^2]\EE[X_1^2] \right)\\
&\leq \frac{1}{n^4} ( nC + n^2C' )\leq \frac{C''}{n^2},
\end{align*}
where $C,C',C''$ are some constants depending on the law of the $X_n$'s.

This proves, by swapping $\sum$ and $\EE$, that
$$
\EE[\sum_{n\geq 1}(S_n/n)^4]=\sum_{n\geq 1}\EE\left[(S_n/n)^4\right]\leq \sum_{n\geq 1}C''/n^2 <+\infty,
$$
and then the series $\sum_{n\geq 1}(S_n/n)^4$ is almost surely finite. But recall that if a series $\sum a_n$ converges then $(a_n)\to 0$. Hence, almost surely, $(S_n/n)^4 \to 0$ and then $S_n/n\to 0$.

For the general case $\mu\neq 0$, it suffices to write
$$
\frac{S_n}{n}-\mu=\frac{(X_1-\mu)+\dots +(X_n-\mu)}{n}
$$
which goes to zero by the previous arguments, since $(X_i-\mu)$ has zero expectation.
\end{proof}
}
Let us turn back to the example of coin tosses. The strong LLN shows that the frequency $\displaystyle{\frac{H_n}{n}}$ of Heads converges to $1/2$, almost surely. Here you may see the meaning of "almost surely": $\Omega=\set{H,T}^\bbN$, and there are $\omega \in\Omega$ such that $\displaystyle{\frac{H_n(\omega)}{n}}$ does not go to $1/2$, for instance
$$
\omega=(H,H,H,H,H,\dots ),
$$
for which $\displaystyle{\frac{H_n(\omega)}{n}=1}$. The strong LLN shows that such $\omega$'s form a set of measure zero.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Convergences of distributions}
We now discuss a very different kind of convergence: convergence of distributions of random variables instead of convergence of random variables themselves.
Let $X_1,X_2,\dots$ be random variables (unlike in the previous section, they may be defined in different probability spaces).
\begin{defi}
One says that $(X_n)_{n\geq 1}$ converges \emph{in distribution} (or \emph{in law}) to $X$ if for every bounded and continuous function $\phi$ we have
$$
\lim_{n\to +\infty} \EE[\phi(X_n)] = \EE[\phi(X)].
$$
One writes $(X_n) \stackrel{\text{(law)}}{\to} X$.
\end{defi}
\noindent{\bf A few words about notations: }\emph{This kind of convergence is very different in that it regards laws rather than random variables. To see why, observe that if $X_1,X_2,\dots$ are identically distributed, then for all $n$, $X_n$ has the same law as $X_1$ and as $X_2$, so that
$$
(X_n) \stackrel{\text{(law)}}{\to} X_1\text{ but also }(X_n) \stackrel{\text{(law)}}{\to} X_2,
$$
and you see that the limit is not unique. In fact, it would be more proper to write that \emph{the law} of $X_n$ converges to \emph{the law} of $X$: you will sometimes find the notation
$$
\mathbb{P}_{X_n} \stackrel{\text{(law)}}{\to} \mathbb{P}_{X}.
$$
One also says that $\mathbb{P}_{X_n}$ converges \emph{weakly}\index{weak convergence} to $\mathbb{P}_{X}$.}

\medskip

Convergence in distribution is in fact the "weakest" of all kinds of convergence:
\begin{prop}
Let $X$ and $(X_n)_{n\geq 1}$ be random variables defined on the same probability space. If $(X_n) \stackrel{\text{prob.}}{\to} X$ then $(X_n) \stackrel{\text{(law)}}{\to} X$.
\end{prop}
{\small
\begin{proof}
Take a continuous function $\phi$ bounded by some $A>0$. For the sake of simplicity, we will assume furthermore that $\phi$ is not only bounded and continuous but also Lipschitz: there exists $c>0$ such that for all $x,y\in\bbR$
$$
|\phi(x)-\phi(y)|\leq c|x-y|.
$$
Fix $\eps>0$ and  write
$$
\begin{array}{r l c c c}
\left|\EE[\phi(X_n)]-\EE[\phi(X)]\right|
&\leq& \EE[\left|\phi(X_n)-\phi(X)\right|] & &\\
&=& \EE\Big[\underbrace{\left|\phi(X_n)-\phi(X)\right|}_{\leq c|X_n-X|\text{ since $\phi$ is Lip.}} \mathds{1}_{|X_n-X|\leq \eps}\Big]
&+&\EE\Big[\underbrace{\left|\phi(X_n)-\phi(X)\right|}_{\leq 2A}\mathds{1}_{|X_n-X|>\eps}\Big]\\
&\leq& \EE\left[c|X_n-X| \mathds{1}_{|X_n-X|\leq \eps}\right]
&+&\EE\Big[\underbrace{\left|\phi(X_n)-\phi(X)\right|}_{\leq 2A}\mathds{1}_{|X_n-X|>\eps}\Big]\\
&\leq& \EE[c\eps \mathds{1}_{|X_n-X|\leq \eps}]
&+&\EE\left[2A \mathds{1}_{|X_n-X|>\eps}\right]\\
&\leq& c\eps
&+&2A \mathbb{P}(|X_n-X|>\eps),
\end{array}
$$
and the last probability goes to zero by assumption. This proves that
$$
\lim_{n\to +\infty}\left|\EE[\phi(X_n)]-\EE[\phi(X)]\right|\leq c\eps
$$
for any $\eps>0$, so the limit is zero.
\end{proof}}
In practice, we often do not compute $\EE[\phi(X_n)]$, but rather use one of the two following criteria:
\begin{prop}\label{Prop:PorteManteau}
The three following conditions are equivalent:
\begin{enumerate}
\item $(X_n) \stackrel{\text{(law)}}{\to} X$,
\item $F_{X_n}(t) \to F_X(t)$ for every real $t$ such that $F_X$ is continuous at $t$,
\item $\Phi_{X_n}(t) \to \Phi_X(t)$ for every real $t$.
\end{enumerate}
\end{prop}
We use 2. or 3. according to how easy it is to compute $F_{X_n}(t)$ or $\Phi_{X_n}(t)$.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
As an application, let us prove the \emph{Law of rare events}: a sum of independent Bernoulli random variables with a small parameter is approximately distributed as a Poisson random variable.
\begin{prop}[The Law of rare\index{rare events (law of)} events]\label{Prop:Rare}
Let $\lambda >0$,
$$
\mathrm{Binom}(n,\lambda/n) \stackrel{\text{(law)}}{\to} \mathrm{Poisson}(\lambda).
$$
\end{prop}
This proposition partly explains why the Poisson distribution is so interesting for modeling.
\begin{center}
\includegraphics[width=8cm]{Figures/HistoBinomPoisson.jpg}\\
\emph{Probabilities of the number of successes in $30$ Bernoulli trials with $10\%$ success are well approximated by the $\mathrm{Poisson}(3)$ (left: $\mathrm{Binom}(30,3/30)$, right: $\mathrm{Poisson}(3)$).}
\end{center}
\medskip
{\small
\begin{proof}[of Proposition \ref{Prop:Rare}] Let $B_n\sim \mathrm{Binom}(n,\lambda/n)$, we use characteristic functions.
\begin{align*}
\EE[\exp(itB_n)]= \sum_{k=0}^n e^{itk}\Prob(B_n=k) &=\sum_{k=0}^n e^{itk}\binom{n}{k}(\tfrac{\lambda}{n})^k(1-\tfrac{\lambda}{n})^{n-k}\\
&= \sum_{k=0}^n \binom{n}{k}(e^{it}\tfrac{\lambda}{n})^k(1-\tfrac{\lambda}{n})^{n-k}\\
&= \left(1-\tfrac{\lambda}{n}+e^{it}\tfrac{\lambda}{n}\right)^n \text{ (by the binomial identity).}
\end{align*}
Now, for each $t$
$$
\EE[\exp(itB_n)]= \left(1+\frac{\lambda e^{it}-\lambda}{n}\right)^n \to \exp\left(\lambda e^{it}-\lambda \right),
$$
where we used $(1+u/n)^n\to \exp(u)$, which is true for real and also (but this is not so easy to prove) for complex numbers. Now it remains to prove that $\exp(\lambda e^{it}-\lambda)$ is the CF of a r.v. $X$ having the Poisson distribution with parameter $\lambda$:
$$
\EE[\exp(itX)]=\sum_{k\geq 0} e^{itk}e^{-\lambda}\frac{\lambda^k}{k!}=\sum_{k\geq 0} e^{-\lambda}\frac{(e^{it}\lambda)^k}{k!}= \exp\left(\lambda e^{it}-\lambda \right).
$$
\end{proof}}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{The Central Limit Theorem}
Let $X_1,X_2,\dots$ be i.i.d. random variables with zero mean and finite variance $\sigma^2$. What can we say about $S_n=X_1+X_2+\dots +X_n$ when $n$ is large?\\
We know that $\frac{S_n}{n}$ goes to zero almost surely, but we still don't know if $S_n$ is of order $\sqrt{n},\sqrt[3]{n},\log(n),\dots$

We can make an educated guess. Let $\alpha>0$ and let us compute\footnote{This approach should remind you of the proof of the weak LLN.}
$$
\EE\left[\left(\frac{S_n}{n^\alpha}\right)^2\right] = \frac{1}{n^{2\alpha}} \EE[(S_n)^2] =\frac{1}{n^{2\alpha}}\mathrm{Var}(S_n)=\frac{1}{n^{2\alpha}}n\mathrm{Var}(X_1)=\frac{\sigma^2}{n^{2\alpha-1}}.
$$
This goes to zero if $2\alpha-1>0$, to infinity if $2\alpha-1<0$, then something interesting seems to happen for $\alpha=1/2$, \emph{i.e.} for the sequence $S_n/\sqrt{n}$. 

%We turn back to the general case where $X_i$'s have arbitrary mean $\mu\neq 0$.
\begin{theo}[The Central Limit\index{CLT (Central Limit Theorem)} Theorem]\label{th:CLT}
Let $(X_n)_{n\geq 1}$ be i.i.d. random variables with finite variance. Set
$\mu=\EE[X_1]$ and $\sigma^2=\mathrm{Var}(X_1)$,
$$
\frac{S_n -n\mu}{\sigma\sqrt{n}} \stackrel{\text{(law)}}{\to} \mathcal{N}(0,1),
$$
where $S_n=X_1+X_2+\dots +X_n$.
\end{theo}
The CLT says that $S_n$ has \emph{gaussian fluctuations} around its mean $n\mu$. One can (loosely) interpret the Theorem as
$$
S_n \approx n\mu +\sigma\sqrt{n}Z,
$$
where $Z\sim \mathcal{N}(0,1)$.
\begin{center}
\includegraphics[width=10cm]{Figures/HistoGauss.jpg}\\
\emph{Probabilities of the number of successes in $30$ Bernoulli trials with $50\%$ success are bell-shaped.}
\end{center}
\medskip

\begin{proof}[of the Central Limit Theorem]
For simplicity, we make the proof in the particular case where 
$$
\Prob(X_n=1)=\Prob(X_n=-1)=1/2,
$$
we have $\mu=0$, $\sigma^2=1$. The proof with an arbitrary distribution for the $X_n$'s is very similar. We will prove that for all $t$
\begin{equation}\tag{$\star$}\label{Eq:ProofCLT}
\Phi_{S_n/\sqrt{n}}(t) \to \exp(-t^2/2),
\end{equation}
since $\exp(-t^2/2)$ is the characteristic function of a $\mathcal{N}(0,1)$.
\begin{align*}
\Phi_{S_n/\sqrt{n}}(t)= \EE\left[ e^{it S_n/\sqrt{n}}\right]
&= \EE\left[ e^{it\frac{(X_1+\dots +X_n)}{\sqrt{n}}}\right]\\
&= \EE\left[ e^{it\frac{X_1}{\sqrt{n}}}\times \dots \times e^{it\frac{X_n}{\sqrt{n}}}\right]\\
&= \EE\left[ e^{it\frac{X_1}{\sqrt{n}}}\right]^n \qquad \text{ (by independence)}.\\
\end{align*}
Now,
$$
\EE\left[ e^{it\frac{X_1}{\sqrt{n}}}\right]=  \Prob(X_1=+1)\times e^{it\frac{+1}{\sqrt{n}}}+\Prob(X_1=-1)\times e^{it\frac{-1}{\sqrt{n}}}= \frac{e^{it/\sqrt{n}}+e^{-it/\sqrt{n}}}{2},
$$
and recall that $e^{it}=\cos(t)+i\sin(t)$, so that $\frac{e^{it}+e^{-it}}{2}=\cos(t)$. Thus
$$
\EE\left[ e^{it\frac{X_1}{\sqrt{n}}}\right]=\cos\left(\frac{t}{\sqrt{n}} \right)=1-\frac{t^2}{2n}+\mathrm{o}(1/n)
$$
(recall $\cos(u)=1-u^2/2 +\mathrm{o}(u^2)$). Finally,
\begin{align*}
\Phi_{S_n/\sqrt{n}}(t)&=\left(1-\frac{t^2}{2n}+\mathrm{o}(1/n)\right)^n\\
&=\exp\left(n \log\left( 1-\frac{t^2}{2n}+\mathrm{o}(1/n)\right) \right)\\
&=\exp\left(n \left( -\frac{t^2}{2n}+\mathrm{o}(1/n)\right) \right)\qquad \text{(since $\log(1+u)=u+\mathrm{o}(u)$)}\\
&\stackrel{n\to +\infty}{\to} \exp(-t^2/2)=\Phi_Z(t),
\end{align*}
and the proof of \eqref{Eq:ProofCLT} is over.
\end{proof}
\subsubsection*{$\rhd\ $Application of the CLT: \index{confidence intervals}confidence intervals}
{\footnotesize
Let us flip $n$ times an unfair coin that turns Heads with probability $p$. Let $H_n$ be the number of Heads in the first $n$ tosses, $H_n\sim \mathrm{Binom}(n,p)$ and we can write
$$
H_n=X_1 +\dots +X_n,
$$
where $X_k$'s are i.i.d. with $\Prob(X_k=1)=p$, $\Prob(X_k=0)=1-p$. Do check that $\EE[X_k]=p$, $\mathrm{Var}(X_k)=p(1-p)$. Then the CLT says that
$$
\frac{H_n -np}{\sqrt{p(1-p)}\sqrt{n}} \stackrel{\text{(law)}}{\to} Z,
$$
where $Z\sim \mathcal{N}(0,1)$. Proposition \ref{Prop:PorteManteau} says then that, for any reals $a,b$,
$$
\Prob\left(a\leq \frac{H_n -np}{\sqrt{p(1-p)}\sqrt{n}}\leq b\right) \stackrel{n\to +\infty}{\to} \Prob(a\leq Z\leq b),
$$
since $F_Z(t)$ is continuous for every $t$.
Take for instance $a$ such that $\Prob(-a\leq Z\leq a)=.95$ (with a computer one finds $a\approx 1.96$), this formula rewrites
$$
\Prob\left(\frac{H_n}{n} \in \left[p\pm 1.96\frac{\sqrt{p(1-p)}}{\sqrt{n}}\right]\right) \stackrel{n\to +\infty}{\to} 95\%.
$$
\begin{center}\emph{
$\Rightarrow$ With $95\%$ chance, the frequency of Heads after $n$ flips is close to $p$ within $\frac{\text{constant}}{\sqrt{n}}$.}
\end{center}
Observe now that for $0<p<1$, one has $p(1-p)\leq 1/4$, so that $1.96\sqrt{p(1-p)}\leq 1.96/2< 1$. Then
\begin{align*}
\left[p\pm \frac{1}{\sqrt{n}}\right] &\supset \left[p\pm 1.96\frac{\sqrt{p(1-p)}}{\sqrt{n}}\right],\text{ and thus}\\
\Prob\left(\frac{H_n}{n} \in \left[p\pm \frac{1}{\sqrt{n}}\right]\right)&\geq  \Prob\left(\frac{H_n}{n} \in \left[p\pm 1.96\frac{\sqrt{p(1-p)}}{\sqrt{n}}\right]\right) \stackrel{n\to +\infty}{\to} 95\%.\\
\end{align*}
It means that with probability higher than $95\%$ one has
$
\left|\frac{H_n}{n}-p \right|\leq \frac{1}{\sqrt{n}}.
$
Let us now reverse the question: imagine that $p$ is unknown, this tells us that $H_n/n$ is a good estimation of $p$. One says that $[\frac{H_n}{n} \pm \frac{1}{\sqrt{n}}]$ is a $95\%$ \emph{confidence interval} for $p$.
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage
\section{Conditional expectation}
\subsubsection*{$\rhd\ $Conditional probabilities}
You already know that if $A,B$ are events of $(\Omega,\Trib,\Prob)$ such that $\Prob(B)>0$, then we define 
$$
\Prob(A|B)=\frac{\Prob(A\cap B)}{\Prob(B)},
$$
this is a prediction made on $A$, given that $B$ occurs. One also can set
$$
\EE[X|B]=\frac{\EE[X\mathds{1}_B]}{\Prob(B)}.
$$
We consider now a different but related problem: what is the best prediction that can be made on $X$, given another random variable $Y$. This should depend on $Y$, and thus be a random variable. 

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Conditional expectation as the orthogonal projection}
\newcommand{\Tribb}{\mathcal{B}}
We need a more abstract point of view, this approach is due to Kolmogorov.
Consider a probability space $(\Omega,\Trib,\Prob)$, recall that
$$
L^2=\set{\text{ random variables }X\text{ such that }\EE[X^2]<+\infty},
$$
where it is understood that $X$ is measurable with respect to $\Trib$. Recall also that $L^2$ is equipped with the scalar product $\EE[XY]$, we say that $X,Y$ are orthogonal if $\EE[XY]=0$.\\
If $\Tribb\subset \Trib$ is a sub-$\sigma$-algebra of $\Trib$ we set
$$
L^2(\Tribb)=\set{X\in L^2\text{ and $X$ is measurable w.r.t.\ }\Tribb}.
$$
\begin{defi} Let $X\in L^2(\Trib)$, the conditional expectation of $X$ given $\Tribb$, denoted by $\EE[X|\Tribb]$, is the orthogonal projection of $X$ onto $L^2(\Tribb)$. It is a random variable measurable with respect to $\Tribb$.
\end{defi}
It is interpreted as the \emph{best prediction} on $X$, given $\Tribb$.

\medskip

\noindent If $Y$ is another random variable, then we set
$$
\EE[X|Y]=\EE[X|\sigma(Y)],
$$
where $\sigma(Y)$ is the $\sigma$-algebra generated by $Y$. 
Recall that by Theorem \ref{Th:sigmaY}: 
\begin{center}
$Z$ is measurable w.r.t. $\sigma(Y)$ $\Leftrightarrow$ $Z$ can be written in the form $\phi(Y)$
\end{center}
where $\phi$ is a Borel function. Thus, in $L^2(\sigma(Y))$ you can find all constants, $\cos(Y)$, $Y^2$, $\exp(Y)$, $\mathds{1}_{Y\geq 0}$,... And since $\EE[X|Y]$ is $\sigma(Y)$-measurable,
\begin{center}\emph{
$\Rightarrow$ $\EE[X|Y]$ is a random variable which is a function of $Y$.}
\end{center}
\begin{center}
\includegraphics[width=12cm]{Figures/Projection.pdf}
\end{center}
Let us now see how to handle this abstract definition. By construction $(X-\EE[X|Y])$ is orthogonal to $L^2(\sigma(Y))$, and then orthogonal to every random variable $\phi(Y)$:
$$
\EE\left[(X-\EE[X|Y])\phi(Y) \right]=0,
$$
which can be rewritten as $\EE[X\phi(Y)]=\EE\left[\EE[X|Y]\phi(Y)\right]$, this is an important property:
\begin{prop}[Characteristic property of the conditional expectation]\label{Prop:CharacEspCond}\ \\
Let $X,Y$ be two random variables in $L^2$. 
\begin{itemize}
\item $\EE[X|Y]$ is the only $\sigma(Y)$-measurable function such that
$$
\EE[X\phi(Y)]=\EE\left[\EE[X|Y]\phi(Y)\right]
$$
for every Borel function $\phi$ such that $\phi(Y)\in L^2$.
\item $\EE[X|\Tribb]$ is the only $\Tribb$-measurable function such that
$$
\EE[XU]=\EE\left[\EE[X|\Tribb]U\right]
$$
for every $U\in L^2(\Tribb)$.
\end{itemize}
\end{prop}
\emph{(Note that with these assumptions, Cauchy-Schwarz's inequality says 
$$
\EE[|X\phi(Y)|]\leq \EE[X^2]^{1/2}\EE[\phi(Y)^2]^{1/2}<+\infty
$$
 and then $\EE[X\phi(Y)]$ is well-defined.)}

This definition of $\EE[X|Y]$ is not constructive at all, but we will see below that Proposition \ref{Prop:CharacEspCond} allows us to compute conditional expectations.
First, let us record some important properties of the conditional expectation.
\newpage
\begin{prop}[Properties of conditional expectations]\label{Prop:EspCond}
\begin{enumerate}[(i)]
\item {\bf (Linearity)} $\EE[aX+X'|\Tribb]=a\EE[X|\Tribb]+\EE[X'|\Tribb]$ for all r.v. $X,X'$ and constant $a$.
\item {\bf (Averaging)} $\EE\left[\EE[X|\Tribb]\right]=\EE[X]$.
\item {\bf (`Taking out what is known')} If $Z$ is $\Tribb$-measurable, then $\EE[ZX|\Tribb]=Z\EE[X|\Tribb]$.\\ In particular, $\EE[Z|\Tribb]=Z$ if $Z$ is $\Tribb$-measurable.
\item {\bf (Independence)} If $X$ is independent of $\Tribb$ then $\EE[X|\Tribb]=\EE[X]$.
\end{enumerate}
\end{prop}
Note that in (iii) we need to assume that $ZX$ is in $L^2$.

Let us justify briefly these properties:\\
{\small 
(i) is obvious since orthogonal projection is a linear operator.\\
(ii) is obtained by taking $U=1$ in the \emph{characteristic property} $\EE[X\times 1]=\EE\left[\EE[X|\Tribb]\times 1\right]$.\\
(iii) is less obvious. First, note that $Z\EE[X|\Tribb]$ is $\Tribb$-measurable so it is a candidate to be $\EE[ZX|\Tribb]$. It remains to prove that 
$$
\left(ZX-Z\EE[X|\Tribb]\right) \perp \Tribb.
$$
Let $U$ be $\Tribb$-measurable, we have
$$
\EE\left[(ZX-Z\EE[X|\Tribb])U \right]=\EE\left[(X-\EE[X|\Tribb])ZU \right]=0,
$$
since $X-\EE[X|\Tribb]$ is orthogonal to $ZU$ which is in $L^2(\Tribb)$.
This proves that $Z\EE[X|\Tribb]$ is the orthogonal projection of $ZX$ onto $L^2(\Tribb)$.\\
(iv) is intuitive: $\Tribb$ doesn't bring information about $X$. To prove so, first note that $\EE[X]$, which is a constant, is $\Tribb$-measurable and then is a candidate to be $\EE[X|\Tribb]$. Let $U$ be $\Tribb$-measurable, $X$ is independent of $U$ and then
$$
\EE[XU]=\EE[X]\EE[U]=\EE\left[\EE[X]U\right],
$$
\emph{i.e.} $\EE[X]$ satisfies the \emph{characteristic property}.
}

\medskip

\noindent In order to see how to use these properties, let us see a very IMPORTANT example.
\begin{example}
Let $X_1,X_2,\dots$ be i.i.d. random variables with zero mean and finite variance, set as usual $S_n=X_1+\dots +X_n$, $S_n$ is then your fortune at the $n$-th step of a fair game.

Given $S_n$, what is the best prediction for $S_{n+1}$?
$$
\begin{array}{r c l l}
\EE[S_{n+1}|S_n]&=&\EE[S_{n}+X_{n+1}|S_n]&\\
                &=&\EE[S_{n}|S_n]+\EE[X_{n+1}|S_n]& \text{ \emph{(by linearity)}}\\
                &=&S_{n}+\EE[X_{n+1}|S_n]& \text{ \emph{(by (iii))}}\\
                &=&S_{n}+\EE[X_{n+1}]& \text{ \emph{(by (iv))}}\\
		&=&S_n.
\end{array}
$$
\end{example}

\begin{exo} Let $\mathcal{E}$ be the (poor) $\sigma$-algebra defined by $\mathcal{E}=\set{\varnothing,\Omega}$.
\begin{enumerate}
\item Prove that the only random variables that are $\mathcal{E}$-measurable are the constants.
\item Prove that $\EE[X|\mathcal{E}]=\EE[X]$.
\end{enumerate}
\end{exo}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage
\subsection{Conditional expectations and densities}
In the case where $(X,Y)$ has a density, there is a formula for $\EE[X|Y]$.
\begin{prop}\label{Prop:EspCondDensity}
Let $(X,Y)$ have joint density $f(x,y)$. Let $f_Y(y)=\int_x f(x,y)dx$ be the marginal density of $Y$,
$$
\EE[X|Y]=\frac{\int_x xf(x,Y)dx}{f_Y(Y)}.
$$
(If $f_Y(Y)=0$ then we set $\EE[X|Y]=0$.)\\
More generally,
$$
\EE[h(X)|Y]=\frac{\int_x h(x)f(x,Y)dx}{f_Y(Y)}.
$$
\end{prop}
\begin{example}
Let $(X,Y)$ have density
$f(x,y)=\frac{1}{x}$ if $0\leq y\leq x \leq 1$ and $0$ otherwise.
Then
$$
f_Y(y)=\int_{x} \mathds{1}_{ y\leq x \leq 1}\frac{dx}{x}=\int_{x=y}^1 \frac{dx}{x}=-\log(y).
$$
The marginal density of $X$ is
$$
f_X(x)=\int_{y} \mathds{1}_{ y\leq x \leq 1}\frac{dy}{x}=\frac{1}{x}\int_{y=0}^x dy=1.
$$
Let us apply the proposition to compute $\EE[X|Y]$:
$$
\int_x xf(x,Y)dx= \int_{x=Y}^1 x\times \frac{dx}{x}= \int_{x=Y}^1 dx= 1-Y. 
$$
Then $\EE[X|Y]=\frac{1-Y}{-\log(Y)}$ (in particular you see that $\EE[X|Y]$ is a function of $Y$). We also can compute $\EE[Y|X]$:
$$
\EE[Y|X]=\frac{\int_y yf(X,y)dy}{f_X(X)}=\frac{\int_{y=0}^X ydy/X}{1}=X/2.
$$
As an exercise, you can check on this example that
$$
\EE\left[\EE[X|Y]\right]=\EE[X]\qquad \text{ and } \qquad \EE\left[\EE[Y|X]\right]=\EE[Y].
$$
\end{example}
Let us prove the formula:
\begin{proof}[of Proposition \ref{Prop:EspCondDensity}]
The random variable 
$$
\frac{\int_x xf(x,Y)dx}{f_Y(Y)}
$$
is a function of $Y$, and then is $\sigma(Y)$-measurable. Thus it is a candidate to be $\EE[X|Y]$, let us prove that it satisfies the \emph{characteristic property} of the conditional expectation.

Let $\phi$ be a Borel function such that $\phi(Y)\in L^2$, $Y$ has density $f_Y$ so
\begin{align*}
\EE\left[\frac{\int_x xf(x,Y)dx}{f_Y(Y)}\phi(Y)\right]
&= \int_y \left(\frac{\int_x xf(x,y)dx}{f_Y(y)}\phi(y)\right) f_Y(y)dy\\
&= \int_y \left(\int_x xf(x,y)dx\right) \phi(y)dy\\
&= \int\hspace{-3mm}\int x\phi(y) f(x,y)dxdy\qquad \text{ (by Fubini's Theorem)}\\
&=\EE[X\phi(Y)],
\end{align*}
which proves the characteristic property.
\end{proof}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection*{$\rhd\ $Conditional distributions}

\begin{defitheo}
Let $(X,Y)$ have joint density $f(x,y)$, denote by $f_Y(y)$ the marginal density of $Y$. The \emph{conditional density} of $X$ given $Y$ is the function
$$
f_{X|Y}(x,y)=\frac{f(x,y)}{f_Y(y)}.
$$
For each fixed $y$, $x\mapsto f_{X|Y}(x,y)$ is a probability density, it is called the \emph{law of $X$ conditional on $Y=y$} (this is an abuse of notation since ``$Y=y$'' is an event of measure zero).
\end{defitheo}


An important case is when $X,Y$ are independent:
$$
f_{X|Y}(x,y)=\frac{f(x,y)}{f_Y(y)}\stackrel{\text{(by ind.)}}{=}\frac{f_X(x)f_Y(y)}{f_Y(y)}=f_X(x),
$$
as expected. With this definition, we have
$$
\EE[X|Y]=\int_x xf_{X|Y}(x,Y)dx,
$$
which is indeed a function of $Y$. And we recover (ii) in Proposition \ref{Prop:EspCond}:
\begin{align*}
\EE\left[\EE[X|Y]\right]&=\int_y \left(\int_x xf_{X|Y}(x,y)dx \right) f_Y(y)dy\\
&=\int\hspace{-2mm}\int x \frac{f(x,y)}{f_Y(y)}f_Y(y)dx dy \qquad\text{ (by Fubini)}\\
&=\int\hspace{-2mm}\int x f(x,y)dx dy =\EE[X].
\end{align*}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage
\printindex
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\newpage
\subsection*{Some useful references}

\bigskip

\begin{itemize}
\item G.Grimmett, D.Stirzaker. \emph{Probability and Random Processes}, Oxford Univ.Press.\\
A very nice and comprehensive introduction to Theoretical Probability, with many examples. This book also covers Random Processes (Martingales, Markov chains, Brownian motion), which is the purpose of your next course.

\item R.Durrett. \emph{Probability: Theory and Examples}, Cambridge Univ.Press.\\
quoting from the cover: ``This book is an introduction to probability theory covering laws of large numbers, central limit theorems, random walks, martingales, Markov chains, ergodic theorems, and Brownian motion.'' Slightly more technical than Grimmett-Stirzaker.

\item D.Williams. \emph{Probability with Martingales}, Cambridge Univ.Press.\\
A very elegant book with a self-contained measure theory, but very theoretical (almost no examples). Yet this is a very nice way to learn about conditional expectations and martingales. 

\item S.M\'el\'eard. \emph{Al\'eatoire}, \'Editions de l'\'Ecole Polytechnique.\\
Downloadable at \url{http://catalogue.polytechnique.fr/}\\
For those who read french, this is a perfect material for this course. It covers Chapters 1 to 5, with more mathematical details and more about discrete probability and combinatorics, but very few about conditional expectations.
\end{itemize}

\end{document}
