#!/usr/bin/env python
# coding: utf-8

# # Solutions-02

# ### Setup Code

# In[ ]:

get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import numpy as np
import seaborn; seaborn.set()  # for plot formatting


# In[ ]:

# Making the data
def make_data(intercept, slope, N=20, dy=5, rseed=42):
    rand = np.random.RandomState(rseed)
    x = 100 * rand.rand(N)
    y = intercept + slope * x
    y += dy * rand.randn(N)
    return x, y, dy * np.ones_like(x)

theta_true = [25, 0.5]
x, y, dy = make_data(*theta_true)
plt.errorbar(x, y, dy, fmt='o');


# ### Quick Exercise
#
# 1. Write a Python function which computes the log-likelihood given a parameter vector $\theta$, an array of errors $\varepsilon$, and arrays of $x$ and $y$ values.
#
# 2. Use tools in [``scipy.optimize``](http://docs.scipy.org/doc/scipy/reference/optimize.html) to maximize this likelihood (i.e. minimize the negative log-likelihood). How close is this result to the input ``theta_true`` above?

# #### Solution

# In[ ]:

def log_likelihood(theta, x, y, dy):
    y_model = theta[0] + theta[1] * x
    return -0.5 * np.sum(np.log(2 * np.pi * dy ** 2)
                         + (y - y_model) ** 2 / dy ** 2)


# In[ ]:

from scipy import optimize

def minfunc(theta, x, y, dy):
    return -log_likelihood(theta, x, y, dy)

result = optimize.minimize(minfunc, x0=[0, 0], args=(x, y, dy))
print("result:", result.x)
print("input: ", theta_true)


# ### Breakout
#
# 1. Using matplotlib, plot the posterior probability distribution for the slope and intercept, once for each prior. I would suggest using ``plt.contourf()`` or ``plt.pcolor()``. How different are the distributions?
#
# 2. Modify the dataset – how do the results change if you have very few data points or very large errors?
#
# 3. If you finish this quickly, try adding 1-sigma and 2-sigma contours to your plot, keeping in mind that the probabilities are not normalized! You can add them to your plot with ``plt.contour()``.

# #### Setup
#
# These are the two prior functions we defined in the notebook:

# In[ ]:

def log_flat_prior(theta):
    if np.all(np.abs(theta) < 1000):
        return 0
    else:
        return -np.inf  # log(0)

def log_symmetric_prior(theta):
    if np.abs(theta[0]) < 1000:
        return -1.5 * np.log(1 + theta[1] ** 2)
    else:
        return -np.inf  # log(0)


# #### Solution

# We'll start by defining a function which takes a two-dimensional grid of likelihoods and returns the 1, 2, and 3-sigma contour levels.
# It works by sorting and normalizing the grid values and then finding the locations of the $0.68^2$, $0.95^2$, and $0.997^2$ cutoffs:

# In[ ]:

def contour_levels(grid):
    """Compute 1, 2, 3-sigma contour levels for a gridded 2D posterior"""
    # Sort the grid values in decreasing order and compute the cumulative fraction
    sorted_ = np.sort(grid.ravel())[::-1]
    pct = np.cumsum(sorted_) / np.sum(sorted_)
    cutoffs = np.searchsorted(pct, np.array([0.68, 0.95, 0.997]) ** 2)
    # Return the levels in increasing order, as plt.contour expects
    return sorted_[cutoffs][::-1]
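# As a quick sanity check (an optional addition, not part of the original exercise), we can evaluate ``contour_levels`` on a simple isotropic Gaussian grid; it should return three increasing level values. The grid size and extent below are arbitrary choices:

# In[ ]:

# Sanity check on a synthetic 2D Gaussian grid (size and extent chosen arbitrarily)
xx, yy = np.meshgrid(np.linspace(-4, 4, 200), np.linspace(-4, 4, 200))
gauss_grid = np.exp(-0.5 * (xx ** 2 + yy ** 2))
print(contour_levels(gauss_grid))  # three levels, from the 3-sigma (lowest) to the 1-sigma (highest)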
# Now we define a function to compute and plot the results of the Bayesian analysis:

# In[ ]:

def plot_results(x, y, dy,
                 slope_limits=(0.3, 0.7),
                 intercept_limits=(15, 35)):
    # 1. Evaluate the log probability on the grid (once for each prior)
    slope_range = np.linspace(*slope_limits)
    intercept_range = np.linspace(*intercept_limits)

    log_P1 = [[log_likelihood([b, m], x, y, dy) + log_flat_prior([b, m])
               for m in slope_range] for b in intercept_range]
    log_P2 = [[log_likelihood([b, m], x, y, dy) + log_symmetric_prior([b, m])
               for m in slope_range] for b in intercept_range]

    # For convenience, normalize each probability density so that its maximum is 1
    P1 = np.exp(log_P1 - np.max(log_P1))
    P2 = np.exp(log_P2 - np.max(log_P2))

    # 2. Create two subplots and plot contours showing the results
    fig, ax = plt.subplots(1, 2, figsize=(16, 6), sharex=True, sharey=True)

    ax[0].contourf(slope_range, intercept_range, P1, 100, cmap='Blues')
    ax[0].contour(slope_range, intercept_range, P1, contour_levels(P1), colors='black')
    ax[0].set_title('Flat Prior')

    ax[1].contourf(slope_range, intercept_range, P2, 100, cmap='Blues')
    ax[1].contour(slope_range, intercept_range, P2, contour_levels(P2), colors='black')
    ax[1].set_title('Symmetric Prior')

    # 3. Add grids and set axis labels
    for axi in ax:
        axi.grid(True, linestyle=':', color='gray', alpha=0.5)
        axi.set_axisbelow(False)
        axi.set_xlabel('slope')
        axi.set_ylabel('intercept')


# In[ ]:

plot_results(x, y, dy)


# We see that the form of the prior in this case makes very little difference to the final posterior. In general, this often ends up being the case: for all the worrying about the effect of the prior, when you have enough data to constrain your model well, the prior has very little effect.

# Let's use some different data and see what happens:

# In[ ]:

x2, y2, dy2 = make_data(*theta_true, N=3, dy=40)
plt.errorbar(x2, y2, dy2, fmt='o');


# In[ ]:

plot_results(x2, y2, dy2, slope_limits=(-2, 5), intercept_limits=(-300, 200))


# We see here that the form of the prior **does** have a clear effect when the data don't constrain the model well (in this case, three points with very large error bars).
# This encodes exactly what you would expect scientifically: if you don't have very good data, it is unlikely to change your views of the world (which are, of course, encoded in the prior).
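# As an optional extension (not part of the original breakout), we can quantify the prior's effect numerically by marginalizing each gridded posterior over the intercept and comparing the resulting posterior mean and standard deviation of the slope. The helper below is a sketch; the grid resolution and limits are arbitrary choices:

# In[ ]:

def slope_summary(x, y, dy, log_prior, slope_limits, intercept_limits, n_grid=100):
    """Grid-based posterior mean and std of the slope, marginalized over the intercept (sketch)."""
    slope_range = np.linspace(*slope_limits, n_grid)
    intercept_range = np.linspace(*intercept_limits, n_grid)
    log_P = np.array([[log_likelihood([b, m], x, y, dy) + log_prior([b, m])
                       for m in slope_range] for b in intercept_range])
    P = np.exp(log_P - log_P.max())
    P_slope = P.sum(axis=0)   # marginalize over the intercept
    P_slope /= P_slope.sum()  # normalize (uniform grid, so a plain sum suffices)
    mean = np.sum(slope_range * P_slope)
    std = np.sqrt(np.sum((slope_range - mean) ** 2 * P_slope))
    return mean, std

for name, log_prior in [('flat', log_flat_prior), ('symmetric', log_symmetric_prior)]:
    print(name, slope_summary(x2, y2, dy2, log_prior,
                              slope_limits=(-2, 5), intercept_limits=(-300, 200)))


# With this sparse, noisy dataset the two summaries should differ noticeably, mirroring the contour plots above; repeating the comparison with the original ``x, y, dy`` (and the narrower grid limits used earlier) should give nearly identical values under both priors.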