# Notebook-style script: it expects to run inside an IPython/Jupyter session.
# This call is the plain-Python spelling of the ``%matplotlib inline`` magic.
get_ipython().run_line_magic("matplotlib", "inline")

import numpy as np
import matplotlib.pyplot as plt

# Generate an un-balanced 2D dataset: 950 "background" points (label 0)
# and 50 "foreground" points (label 1).
np.random.seed(0)
X = np.vstack([np.random.normal(0, 1, (950, 2)),
               np.random.normal(-1.8, 0.8, (50, 2))])
y = np.hstack([np.zeros(950), np.ones(50)])

plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='none', cmap=plt.cm.Accent);

# ---------------------------------------------------------------------------
# Exercise
# ---------------------------------------------------------------------------
from sklearn.svm import SVC

# First instantiate the "Support Vector Classifier" (SVC) model

# Next split the data (X and y) into a training and test set

# fit the model to the training data

# compute y_pred, the predicted labels of the test data

# Now that this is finished, we'll compute the classification rate
# print("accuracy:", np.sum(y_test == y_pred) * 1. / len(y_test))

# ---------------------------------------------------------------------------
# Solution
# ---------------------------------------------------------------------------
# ``sklearn.cross_validation`` and ``sklearn.grid_search`` were merged into
# ``sklearn.model_selection`` and later removed; prefer the new location and
# only fall back to the legacy modules on old scikit-learn releases.
try:
    from sklearn.model_selection import (GridSearchCV, cross_val_score,
                                         train_test_split)
except ImportError:
    from sklearn.cross_validation import cross_val_score, train_test_split
    from sklearn.grid_search import GridSearchCV
from sklearn import metrics
from sklearn.svm import SVC

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = SVC().fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("accuracy:", metrics.accuracy_score(y_test, y_pred))

# Accuracy alone is a weak summary for an unbalanced dataset, so also report
# precision, recall and their harmonic mean.
print("precision:", metrics.precision_score(y_test, y_pred))
print("recall:", metrics.recall_score(y_test, y_pred))
print("f1 score:", metrics.f1_score(y_test, y_pred))

print(metrics.classification_report(y_test, y_pred,
                                    target_names=['background', 'foreground']))

# ---------------------------------------------------------------------------
# Manual 2-fold cross-validation: train on each half, score on the other
# ---------------------------------------------------------------------------
X1, X2, y1, y2 = train_test_split(X, y, test_size=0.5)
print(X1.shape)
print(X2.shape)

y2_pred = SVC().fit(X1, y1).predict(X2)
y1_pred = SVC().fit(X2, y2).predict(X1)

print([metrics.precision_score(y1, y1_pred),
       metrics.precision_score(y2, y2_pred)])

# Let's do a 2-fold cross-validation of the SVC estimator
print(cross_val_score(SVC(), X, y, cv=2, scoring='precision'))

# ---------------------------------------------------------------------------
# Grid search over the SVC regularisation parameter C
# ---------------------------------------------------------------------------
clf = SVC()
Crange = np.logspace(-2, 2, 40)
grid = GridSearchCV(clf, param_grid={'C': Crange},
                    scoring='precision', cv=5)
grid.fit(X, y)

print("best parameter choice:", grid.best_params_)

# Mean cross-validated score for each candidate C, in the order of ``Crange``.
# ``grid_scores_`` was replaced by ``cv_results_``; support both.
if hasattr(grid, "cv_results_"):
    scores = grid.cv_results_["mean_test_score"]
else:
    scores = [g[1] for g in grid.grid_scores_]
plt.semilogx(Crange, scores);

# ---------------------------------------------------------------------------
# Exercise: grid search over n_neighbors on the digits data
# ---------------------------------------------------------------------------
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_digits

digits = load_digits()
X, y = digits.data, digits.target

# construct the K Neighbors classifier

# Use GridSearchCV to find the best accuracy given choice of ``n_neighbors``

# Plot the accuracy as a function of the number of neighbors.
# Does this change significantly if you use more/fewer folds?


# ---------------------------------------------------------------------------
# Synthetic 1D regression problem
# ---------------------------------------------------------------------------
def test_func(x, err=0.5):
    """Evaluate ``10 - 1 / (x + 0.1)``, optionally with Gaussian noise.

    Parameters
    ----------
    x : scalar or array-like
        Point(s) at which to evaluate the function.
    err : float
        Standard deviation of the noise added to the result. When
        ``err <= 0`` the exact function value is returned.

    Returns
    -------
    Values of the same shape as ``x``.
    """
    y = 10 - 1. / (x + 0.1)
    if err > 0:
        # Draw from a normal distribution centred on the true value.
        y = np.random.normal(y, err)
    return y


def make_data(N=40, error=1.0, random_seed=1):
    """Build a reproducible noisy sample of ``test_func``.

    Parameters
    ----------
    N : int
        Number of points to draw.
    error : float
        Noise level forwarded to ``test_func`` as ``err``.
    random_seed : int
        Seed for NumPy's global random number generator.

    Returns
    -------
    X : ndarray of shape (N, 1)
        Sample locations drawn by ``np.random.random``.
    y : ndarray of shape (N,)
        Noisy function values at those locations.
    """
    # Seed with the caller-supplied value (previously hard-coded to 1, which
    # silently ignored the ``random_seed`` argument).
    np.random.seed(random_seed)
    X = np.random.random(N)[:, np.newaxis]
    y = test_func(X.ravel(), error)

    return X, y


X, y = make_data(40, error=1)
plt.scatter(X.ravel(), y);

# Straight-line fit
X_test = np.linspace(-0.1, 1.1, 500)[:, None]

from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X, y)
y_test = model.predict(X_test)

plt.scatter(X.ravel(), y)
plt.plot(X_test.ravel(), y_test)
print("mean squared error:", metrics.mean_squared_error(model.predict(X), y))

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline


def PolynomialRegression(degree, **kwargs):
    """Return a pipeline that fits a polynomial of the given degree.

    The pipeline chains ``PolynomialFeatures(degree)`` with
    ``LinearRegression(**kwargs)``; extra keyword arguments are passed to
    the linear regression step.
    """
    return make_pipeline(PolynomialFeatures(degree),
                         LinearRegression(**kwargs))


# Quadratic fit
model = PolynomialRegression(2)
model.fit(X, y)
y_test = model.predict(X_test)

plt.scatter(X.ravel(), y)
plt.plot(X_test.ravel(), y_test)
print("mean squared error:", metrics.mean_squared_error(model.predict(X), y))

# Degree-30 fit
model = PolynomialRegression(degree=30)
model.fit(X, y)
y_test = model.predict(X_test)

plt.scatter(X.ravel(), y)
plt.plot(X_test.ravel(), y_test)
plt.ylim(-4, 14)
print("mean squared error:", metrics.mean_squared_error(model.predict(X), y))

# ``IPython.html.widgets`` moved to the standalone ``ipywidgets`` package;
# fall back to the legacy location only on old IPython installations.
try:
    from ipywidgets import interact
except ImportError:
    from IPython.html.widgets import interact


def plot_fit(degree, Npts):
    """Fit and plot a polynomial of ``degree`` to ``Npts`` generated points.

    The plot title shows the mean squared error on the training points.
    """
    X, y = make_data(Npts, error=1)
    X_test = np.linspace(-0.1, 1.1, 500)[:, None]

    model = PolynomialRegression(degree=degree)
    model.fit(X, y)
    y_test = model.predict(X_test)

    plt.scatter(X.ravel(), y)
    plt.plot(X_test.ravel(), y_test)
    plt.ylim(-4, 14)
    plt.title("mean squared error: {0:.2f}".format(
        metrics.mean_squared_error(model.predict(X), y)))


# Tuples request (min, max) sliders; current ipywidgets turns a list into a
# dropdown offering only the listed values.
interact(plot_fit, degree=(1, 30), Npts=(2, 100));

# ---------------------------------------------------------------------------
# Validation curve: training and test error versus polynomial degree
# ---------------------------------------------------------------------------
degrees = np.arange(1, 30)

X, y = make_data(100, error=1.0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

training_error = []
test_error = []
mse = metrics.mean_squared_error

for d in degrees:
    model = PolynomialRegression(d).fit(X_train, y_train)
    training_error.append(mse(model.predict(X_train), y_train))
    test_error.append(mse(model.predict(X_test), y_test))

# note that the test error can also be computed via cross-validation
plt.plot(degrees, training_error, label='training')
plt.plot(degrees, test_error, label='test')
plt.legend()
plt.xlabel('degree')
plt.ylabel('MSE');

# ---------------------------------------------------------------------------
# Learning curves: training and test error versus training-set size
# ---------------------------------------------------------------------------
from sklearn.tree import DecisionTreeRegressor

X, y = make_data(200, error=1.0)
degree = 3

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# 20 training-set sizes, from 15 points up to the whole training split.
N_range = np.linspace(15, X_train.shape[0], 20).astype(int)


def plot_learning_curve(degree=3):
    """Plot training and test MSE against the number of training points.

    For each size in the module-level ``N_range``, a polynomial of the given
    ``degree`` is fit to the first ``N`` rows of ``X_train``/``y_train`` and
    scored on both that subset and the module-level ``X_test``/``y_test``.
    """
    training_error = []
    test_error = []
    mse = metrics.mean_squared_error

    for N in N_range:
        XN = X_train[:N]
        yN = y_train[:N]

        model = PolynomialRegression(degree).fit(XN, yN)
        training_error.append(mse(model.predict(XN), yN))
        test_error.append(mse(model.predict(X_test), y_test))

    plt.plot(N_range, training_error, label='training')
    plt.plot(N_range, test_error, label='test')
    # Dotted reference line at MSE = 1, the variance of the noise injected
    # by ``make_data(..., error=1.0)`` above.
    plt.plot(N_range, np.ones_like(N_range), ':k')
    plt.legend()
    plt.title('degree = {0}'.format(degree))
    plt.xlabel('num. training points')
    plt.ylabel('MSE')


plot_learning_curve(3)

plot_learning_curve(2)

plot_learning_curve(5)