%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

# Generate an un-balanced 2D dataset
np.random.seed(0)
X = np.vstack([np.random.normal(0, 1, (950, 2)),
               np.random.normal(-1.8, 0.8, (50, 2))])
y = np.hstack([np.zeros(950), np.ones(50)])

plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='none',
            cmap=plt.cm.Accent);

from sklearn.svm import SVC
# First instantiate the "Support Vector Classifier" (SVC) model


# Next split the data (X and y) into a training and test set


# fit the model to the training data


# compute y_pred, the predicted labels of the test data


# Now that this is finished, we'll compute the classification rate
# print "accuracy:", np.sum(ytest == ypred) * 1. / len(y_test)

# Run the following to load the solution:
#  %load solutions/06-1_svm_class.py

from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.svm import SVC

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = SVC().fit(X_train, y_train)
y_pred = clf.predict(X_test)

print "accuracy:", metrics.accuracy_score(y_test, y_pred)
print "precision:", metrics.precision_score(y_test, y_pred)
print "recall:", metrics.recall_score(y_test, y_pred)
print "f1 score:", metrics.f1_score(y_test, y_pred)

print metrics.classification_report(y_test, y_pred,
                                    target_names=['background', 'foreground'])


# Run the following to load the solution:
#  %load solutions/06-2_unbalanced.py

X1, X2, y1, y2 = train_test_split(X, y, test_size=0.5)
print X1.shape
print X2.shape

y2_pred = SVC().fit(X1, y1).predict(X2)
y1_pred = SVC().fit(X2, y2).predict(X1)

print np.mean([metrics.precision_score(y1, y1_pred),
               metrics.precision_score(y2, y2_pred)])

from sklearn.cross_validation import cross_val_score

# Let's do a 2-fold cross-validation of the SVC estimator
print cross_val_score(SVC(), X, y, cv=2, scoring='precision')


# Run the following to load the solution:
#  %load solutions/06-3_5fold_crossval.py

from sklearn.grid_search import GridSearchCV

clf = SVC()
Crange = np.logspace(-2, 2, 40)

grid = GridSearchCV(clf, param_grid={'C': Crange},
                    scoring='precision', cv=5)
grid.fit(X, y)

print "best parameter choice:", grid.best_params_

scores = [g[1] for g in grid.grid_scores_]
plt.semilogx(Crange, scores);

from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_digits
digits = load_digits()
X, y = digits.data, digits.target

# construct the K Neighbors classifier


# Use GridSearchCV to find the best accuracy given choice of ``n_neighbors``


# Plot the accuracy as a function of the number of neighbors.
# Does this change significantly if you use more/fewer folds?

# Run the following to load the solution:
#  %load solutions/06-4_gridsearch.py

def test_func(x, err=0.5):
    y = 10 - 1. / (x + 0.1)
    if err > 0:
        y = np.random.normal(y, err)
    return y

def make_data(N=40, error=1.0, random_seed=1):
    # randomly sample the data
    np.random.seed(1)
    X = np.random.random(N)[:, np.newaxis]
    y = test_func(X.ravel(), error)
    
    return X, y

X, y = make_data(40, error=1)
plt.scatter(X.ravel(), y);

X_test = np.linspace(-0.1, 1.1, 500)[:, None]

from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X, y)
y_test = model.predict(X_test)

plt.scatter(X.ravel(), y)
plt.plot(X_test.ravel(), y_test)
print "mean squared error:", metrics.mean_squared_error(model.predict(X), y)

class PolynomialRegression(LinearRegression):
    """Simple Polynomial Regression to 1D data"""
    def __init__(self, degree=1, **kwargs):
        self.degree = degree
        LinearRegression.__init__(self, **kwargs)
        
    def fit(self, X, y):
        if X.shape[1] != 1:
            raise ValueError("Only 1D data valid here")
        Xp = X ** (1 + np.arange(self.degree))
        return LinearRegression.fit(self, Xp, y)
        
    def predict(self, X):
        Xp = X ** (1 + np.arange(self.degree))
        return LinearRegression.predict(self, Xp)

model = PolynomialRegression(degree=2)
model.fit(X, y)
y_test = model.predict(X_test)

plt.scatter(X.ravel(), y)
plt.plot(X_test.ravel(), y_test)
print "mean squared error:", metrics.mean_squared_error(model.predict(X), y)

model = PolynomialRegression(degree=30)
model.fit(X, y)
y_test = model.predict(X_test)

plt.scatter(X.ravel(), y)
plt.plot(X_test.ravel(), y_test)
plt.ylim(-4, 14)
print "mean squared error:", metrics.mean_squared_error(model.predict(X), y)

degrees = np.arange(1, 30)

X, y = make_data(100, error=1.0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

training_error = []
test_error = []
mse = metrics.mean_squared_error

for d in degrees:
    model = PolynomialRegression(d).fit(X_train, y_train)
    training_error.append(mse(model.predict(X_train), y_train))
    test_error.append(mse(model.predict(X_test), y_test))
    
# note that the test error can also be computed via cross-validation
plt.plot(degrees, training_error, label='training')
plt.plot(degrees, test_error, label='test')
plt.legend()
plt.xlabel('degree')
plt.ylabel('MSE');

from sklearn.tree import DecisionTreeRegressor

# Run the following to load the solution:
#  %load solutions/06-5_decisiontree.py

# Run the following to load the solution:
#  %load solutions/06-6_randomforest.py

X, y = make_data(200, error=1.0)
degree = 3

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

N_range = np.linspace(15, X_train.shape[0], 20).astype(int)

def plot_learning_curve(degree=3):
    training_error = []
    test_error = []
    mse = metrics.mean_squared_error
    
    for N in N_range:
        XN = X_train[:N]
        yN = y_train[:N]
    
        model = PolynomialRegression(degree).fit(XN, yN)
        training_error.append(mse(model.predict(XN), yN))
        test_error.append(mse(model.predict(X_test), y_test))
      
    plt.plot(N_range, training_error, label='training')
    plt.plot(N_range, test_error, label='test')
    plt.plot(N_range, np.ones_like(N_range), ':k')
    plt.legend()
    plt.title('degree = {0}'.format(degree))
    plt.xlabel('num. training points')
    plt.ylabel('MSE')
    
plot_learning_curve(3)

plot_learning_curve(2)

plot_learning_curve(5)


# As this final exercise is fairly open-ended, there is no solution provided!