# Validation and model selection with scikit-learn (notebook export).
#
# The sections below are notebook cells laid out in their original order.
# They share module-level names (X, y, X_train, X_test, ...) and rebind them
# as they go, so the statement order matters.

# Enable inline plotting when running under IPython/Jupyter.  Plain Python has
# no ``get_ipython`` name, in which case the magic is simply skipped.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')  # noqa: F821
except NameError:
    pass

import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.datasets import load_digits
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor

# ``sklearn.cross_validation`` and ``sklearn.grid_search`` were merged into
# ``sklearn.model_selection`` (0.18) and later removed (0.20); fall back to the
# old locations so the script still runs on older installations.
try:
    from sklearn.model_selection import (GridSearchCV, cross_val_score,
                                         train_test_split)
except ImportError:
    from sklearn.cross_validation import cross_val_score, train_test_split
    from sklearn.grid_search import GridSearchCV


# ---------------------------------------------------------------------------
# Classification metrics on an unbalanced dataset
# ---------------------------------------------------------------------------

# Generate an un-balanced 2D dataset
np.random.seed(0)
X = np.vstack([np.random.normal(0, 1, (950, 2)),
               np.random.normal(-1.8, 0.8, (50, 2))])
y = np.hstack([np.zeros(950), np.ones(50)])

plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='none', cmap=plt.cm.Accent)

# Exercise (uses SVC, imported above):
# First instantiate the "Support Vector Classifier" (SVC) model

# Next split the data (X and y) into a training and test set

# fit the model to the training data

# compute y_pred, the predicted labels of the test data

# Now that this is finished, we'll compute the classification rate
# print("accuracy: {0}".format(np.sum(y_test == y_pred) * 1. / len(y_test)))

# Run the following to load the solution:
# %load solutions/06-1_svm_class.py

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = SVC().fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Single-argument ``print(...)`` emits the same text on Python 2 and 3.
print("accuracy: {0}".format(metrics.accuracy_score(y_test, y_pred)))

print("precision: {0}".format(metrics.precision_score(y_test, y_pred)))
print("recall: {0}".format(metrics.recall_score(y_test, y_pred)))
print("f1 score: {0}".format(metrics.f1_score(y_test, y_pred)))

print(metrics.classification_report(y_test, y_pred,
                                    target_names=['background', 'foreground']))

# Run the following to load the solution:
# %load solutions/06-2_unbalanced.py


# ---------------------------------------------------------------------------
# Cross-validation
# ---------------------------------------------------------------------------

# Manual 2-fold cross-validation: train on each half, score on the other.
X1, X2, y1, y2 = train_test_split(X, y, test_size=0.5)
print(X1.shape)
print(X2.shape)

y2_pred = SVC().fit(X1, y1).predict(X2)
y1_pred = SVC().fit(X2, y2).predict(X1)

print(np.mean([metrics.precision_score(y1, y1_pred),
               metrics.precision_score(y2, y2_pred)]))

# Let's do a 2-fold cross-validation of the SVC estimator
print(cross_val_score(SVC(), X, y, cv=2, scoring='precision'))

# Run the following to load the solution:
# %load solutions/06-3_5fold_crossval.py


# ---------------------------------------------------------------------------
# Grid search
# ---------------------------------------------------------------------------

clf = SVC()
Crange = np.logspace(-2, 2, 40)
grid = GridSearchCV(clf, param_grid={'C': Crange}, scoring='precision', cv=5)
grid.fit(X, y)

print("best parameter choice: {0}".format(grid.best_params_))

# ``grid_scores_`` was replaced by ``cv_results_`` (and later removed); prefer
# the new attribute and only fall back on releases that predate it.
if hasattr(grid, 'cv_results_'):
    scores = grid.cv_results_['mean_test_score']
else:
    scores = [g[1] for g in grid.grid_scores_]
plt.semilogx(Crange, scores)

# Exercise (uses KNeighborsClassifier and load_digits, imported above):
digits = load_digits()
X, y = digits.data, digits.target

# construct the K Neighbors classifier

# Use GridSearchCV to find the best accuracy given choice of ``n_neighbors``

# Plot the accuracy as a function of the number of neighbors.
# Does this change significantly if you use more/fewer folds?

# Run the following to load the solution:
# %load solutions/06-4_gridsearch.py


# ---------------------------------------------------------------------------
# Overfitting, underfitting and model selection
# ---------------------------------------------------------------------------

def test_func(x, err=0.5):
    """Evaluate ``10 - 1 / (x + 0.1)``, optionally with Gaussian noise.

    Parameters
    ----------
    x : scalar or ndarray
        Point(s) at which to evaluate the function.
    err : float
        Standard deviation of the noise added to each value.  When it is
        not positive the exact function value is returned.

    Returns
    -------
    Value(s) of the function at ``x``; drawn from the global NumPy random
    state when ``err > 0``.
    """
    y = 10 - 1. / (x + 0.1)
    if err > 0:
        y = np.random.normal(y, err)
    return y


def make_data(N=40, error=1.0, random_seed=1):
    """Create a reproducible noisy 1D regression dataset.

    Parameters
    ----------
    N : int
        Number of samples.
    error : float
        Noise level forwarded to ``test_func``.
    random_seed : int
        Seed applied to the global NumPy random state before sampling.

    Returns
    -------
    X : ndarray, shape (N, 1)
        Uniform samples from [0, 1).
    y : ndarray, shape (N,)
        ``test_func`` evaluated at ``X`` with noise ``error``.
    """
    # randomly sample the data
    # (honour the caller's seed; it used to be hard-coded to 1)
    np.random.seed(random_seed)
    X = np.random.random(N)[:, np.newaxis]
    y = test_func(X.ravel(), error)

    return X, y


X, y = make_data(40, error=1)
plt.scatter(X.ravel(), y)

# Straight-line fit
X_test = np.linspace(-0.1, 1.1, 500)[:, None]

model = LinearRegression()
model.fit(X, y)
y_test = model.predict(X_test)

plt.scatter(X.ravel(), y)
plt.plot(X_test.ravel(), y_test)
print("mean squared error: {0}".format(
    metrics.mean_squared_error(model.predict(X), y)))


class PolynomialRegression(LinearRegression):
    """Simple Polynomial Regression to 1D data

    The single input column is expanded into the powers
    ``x, x**2, ..., x**degree`` and an ordinary ``LinearRegression`` is fit
    on those columns.

    Parameters
    ----------
    degree : int
        Highest power of ``x`` used as a feature.
    **kwargs
        Forwarded unchanged to ``LinearRegression.__init__``.
    """

    def __init__(self, degree=1, **kwargs):
        self.degree = degree
        LinearRegression.__init__(self, **kwargs)

    def _polynomial_features(self, X):
        """Return the ``(n_samples, degree)`` array of powers of ``X``.

        Raises
        ------
        ValueError
            If ``X`` does not have exactly one column; otherwise the
            broadcast below could silently build the wrong features.
        """
        if X.shape[1] != 1:
            raise ValueError("Only 1D data valid here")
        # (n, 1) ** (degree,) broadcasts to (n, degree): columns x^1 .. x^degree
        return X ** (1 + np.arange(self.degree))

    def fit(self, X, y):
        """Fit the polynomial to ``X`` (one column) and targets ``y``."""
        return LinearRegression.fit(self, self._polynomial_features(X), y)

    def predict(self, X):
        """Predict targets for ``X`` (one column) with the fitted polynomial."""
        return LinearRegression.predict(self, self._polynomial_features(X))


# Quadratic fit
model = PolynomialRegression(degree=2)
model.fit(X, y)
y_test = model.predict(X_test)

plt.scatter(X.ravel(), y)
plt.plot(X_test.ravel(), y_test)
print("mean squared error: {0}".format(
    metrics.mean_squared_error(model.predict(X), y)))

# Degree-30 fit
model = PolynomialRegression(degree=30)
model.fit(X, y)
y_test = model.predict(X_test)

plt.scatter(X.ravel(), y)
plt.plot(X_test.ravel(), y_test)
plt.ylim(-4, 14)
print("mean squared error: {0}".format(
    metrics.mean_squared_error(model.predict(X), y)))

# Training vs. test error as a function of polynomial degree
degrees = np.arange(1, 30)

X, y = make_data(100, error=1.0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

training_error = []
test_error = []
mse = metrics.mean_squared_error

for d in degrees:
    model = PolynomialRegression(d).fit(X_train, y_train)
    training_error.append(mse(model.predict(X_train), y_train))
    test_error.append(mse(model.predict(X_test), y_test))

# note that the test error can also be computed via cross-validation
plt.plot(degrees, training_error, label='training')
plt.plot(degrees, test_error, label='test')
plt.legend()
plt.xlabel('degree')
plt.ylabel('MSE')

# Exercise (uses DecisionTreeRegressor, imported above):
# Run the following to load the solution:
# %load solutions/06-5_decisiontree.py

# Run the following to load the solution:
# %load solutions/06-6_randomforest.py


# ---------------------------------------------------------------------------
# Learning curves
# ---------------------------------------------------------------------------

X, y = make_data(200, error=1.0)
degree = 3

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

N_range = np.linspace(15, X_train.shape[0], 20).astype(int)


def plot_learning_curve(degree=3):
    """Plot training and test MSE against the number of training points.

    For every ``N`` in the module-level ``N_range`` a ``PolynomialRegression``
    of the given ``degree`` is fit on the first ``N`` rows of the module-level
    ``X_train`` / ``y_train`` and scored on those rows and on the module-level
    ``X_test`` / ``y_test``.  Curves are drawn on the current matplotlib axes.
    """
    training_error = []
    test_error = []
    mse = metrics.mean_squared_error

    for N in N_range:
        XN = X_train[:N]
        yN = y_train[:N]

        model = PolynomialRegression(degree).fit(XN, yN)
        training_error.append(mse(model.predict(XN), yN))
        test_error.append(mse(model.predict(X_test), y_test))

    plt.plot(N_range, training_error, label='training')
    plt.plot(N_range, test_error, label='test')
    # dotted reference line at MSE = 1
    plt.plot(N_range, np.ones_like(N_range), ':k')
    plt.legend()
    plt.title('degree = {0}'.format(degree))
    plt.xlabel('num. training points')
    plt.ylabel('MSE')


plot_learning_curve(3)

plot_learning_curve(2)

plot_learning_curve(5)

# As this final exercise is fairly open-ended, there is no solution provided!