# Notebook-style walkthrough of basic scikit-learn usage: loading data sets,
# fitting estimators, dimensionality reduction, clustering and validation.
# The cells are run top to bottom as a flat script.

# Start the inline backend for plotting.  ``%matplotlib inline`` is IPython
# syntax, so it is invoked through ``get_ipython`` and skipped when the file
# is run by a plain Python interpreter.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# Import the example plot from the figures directory
from fig_code import plot_sgd_separator
plot_sgd_separator()

# Uncomment the %load command to load the contents of the file
# %load fig_code/sgd_separator.py

from fig_code import plot_linear_regression
plot_linear_regression()

# ``IPython.core.display`` is a deprecated import path for these helpers.
from IPython.display import Image, display
display(Image(filename='images/iris_setosa.jpg'))
print("Iris Setosa\n")

display(Image(filename='images/iris_versicolor.jpg'))
print("Iris Versicolor\n")

display(Image(filename='images/iris_virginica.jpg'))
print("Iris Virginica")

from sklearn.datasets import load_iris
iris = load_iris()

# Bare expressions are only echoed by a notebook; print them explicitly.
print(iris.keys())

n_samples, n_features = iris.data.shape
print((n_samples, n_features))
print(iris.data[0])

print(iris.data.shape)
print(iris.target.shape)

print(iris.target)

print(iris.target_names)

import numpy as np
import matplotlib.pyplot as plt

x_index = 0
y_index = 1

# this formatter will label the colorbar with the correct target names
formatter = plt.FuncFormatter(lambda i, *args: iris.target_names[int(i)])

# ``plt.get_cmap`` replaces ``plt.cm.get_cmap``, which recent matplotlib
# releases no longer provide.
plt.scatter(iris.data[:, x_index], iris.data[:, y_index],
            c=iris.target, cmap=plt.get_cmap('RdYlBu', 3))
plt.colorbar(ticks=[0, 1, 2], format=formatter)
plt.clim(-0.5, 2.5)
plt.xlabel(iris.feature_names[x_index])
plt.ylabel(iris.feature_names[y_index])

from sklearn import datasets
# datasets.fetch_

from sklearn.linear_model import LinearRegression

# Estimator hyperparameters are set at construction time and stored as
# attributes.  ``fit_intercept`` is shown here because the ``normalize``
# option no longer exists in current scikit-learn.
model = LinearRegression(fit_intercept=True)
print(model.fit_intercept)

print(model)

x = np.arange(10)
y = 2 * x + 1

plt.plot(x, y, 'o')

# The input data for sklearn is 2D: (samples == 10 x features == 1)
X = x[:, np.newaxis]
print(X)
print(y)

# fit the model on our data
model.fit(X, y)

# underscore at the end indicates a fit parameter
print(model.coef_)
print(model.intercept_)

# ``model.residues_`` is no longer available; compute the residual sum of
# squares directly from the fitted model instead.
print(np.sum((model.predict(X) - y) ** 2))

from sklearn import neighbors, datasets

iris = datasets.load_iris()
X, y = iris.data, iris.target

# create the model
knn = neighbors.KNeighborsClassifier(n_neighbors=1)

# fit the model
knn.fit(X, y)

# What kind of iris has 3cm x 5cm sepal and 4cm x 2cm petal?
# call the "predict" method:
result = knn.predict([[3, 5, 4, 2],])

print(iris.target_names[result])

from fig_code import plot_iris_knn
plot_iris_knn()

from sklearn.svm import SVC

model = SVC()
model.fit(X, y)
result = model.predict([[3, 5, 4, 2],])
print(iris.target_names[result])

# Create some simple data
import numpy as np
np.random.seed(0)
X = np.random.random(size=(20, 1))
y = 3 * X.squeeze() + 2 + np.random.randn(20)

# Fit a linear regression to it
from sklearn.linear_model import LinearRegression
model = LinearRegression(fit_intercept=True)
model.fit(X, y)
# ``coef_`` is a one-element array here; index it so ``%.5f`` receives a
# scalar rather than relying on implicit array-to-float conversion.
print("Model coefficient: %.5f, and intercept: %.5f"
      % (model.coef_[0], model.intercept_))

# Plot the data and the model prediction
X_test = np.linspace(0, 1, 100)[:, np.newaxis]
y_test = model.predict(X_test)

plt.plot(X.squeeze(), y, 'o')
plt.plot(X_test.squeeze(), y_test)

X, y = iris.data, iris.target

from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(X)
X_reduced = pca.transform(X)
print("Reduced dataset shape:", X_reduced.shape)

import pylab as pl
pl.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y,
           cmap='RdYlBu')

print("Meaning of the 2 components:")
for component in pca.components_:
    print(" + ".join("%.3f x %s" % (value, name)
                     for value, name in zip(component,
                                            iris.feature_names)))

from sklearn.cluster import KMeans
k_means = KMeans(n_clusters=3, random_state=0)  # Fixing the RNG in kmeans
k_means.fit(X)
y_pred = k_means.predict(X)

pl.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y_pred,
           cmap='RdYlBu')

from sklearn import datasets
digits = datasets.load_digits()
print(digits.images.shape)

# Show the first 100 digit images, each annotated with its target label.
fig, axes = plt.subplots(10, 10, figsize=(8, 8))
fig.subplots_adjust(hspace=0.1, wspace=0.1)

for i, ax in enumerate(axes.flat):
    ax.imshow(digits.images[i], cmap='binary')
    ax.text(0.05, 0.05, str(digits.target[i]),
            transform=ax.transAxes, color='green')
    ax.set_xticks([])
    ax.set_yticks([])

# The images themselves
print(digits.images.shape)
print(digits.images[0])

# The data for use in our algorithms
print(digits.data.shape)
print(digits.data[0])

# The target label
print(digits.target)

from sklearn.manifold import Isomap

iso = Isomap(n_components=2)
data_projected = iso.fit_transform(digits.data)

plt.scatter(data_projected[:, 0], data_projected[:, 1], c=digits.target,
            edgecolor='none', alpha=0.5,
            cmap=plt.get_cmap('nipy_spectral', 10))
plt.colorbar(label='digit label', ticks=range(10))
plt.clim(-0.5, 9.5)

# ``train_test_split`` lives in ``sklearn.model_selection``; the old
# ``sklearn.cross_validation`` module has been removed.
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(digits.data, digits.target)
print(Xtrain.shape, Xtest.shape)

from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(penalty='l2')
clf.fit(Xtrain, ytrain)
ypred = clf.predict(Xtest)

from sklearn.metrics import accuracy_score
print(accuracy_score(ytest, ypred))

from sklearn.metrics import confusion_matrix
print(confusion_matrix(ytest, ypred))

plt.imshow(confusion_matrix(ytest, ypred),
           cmap='Blues', interpolation='nearest')
plt.ylabel('true')
plt.xlabel('predicted')

# Show the first 100 test images; the predicted label is drawn in green
# when it matches the true label and in red otherwise.
fig, axes = plt.subplots(10, 10, figsize=(8, 8))
fig.subplots_adjust(hspace=0.1, wspace=0.1)

for i, ax in enumerate(axes.flat):
    ax.imshow(Xtest[i].reshape(8, 8), cmap='binary')
    ax.text(0.05, 0.05, str(ypred[i]),
            transform=ax.transAxes,
            color='green' if (ytest[i] == ypred[i]) else 'red')
    ax.set_xticks([])
    ax.set_yticks([])

from astroML.datasets import fetch_rrlyrae_combined
from sklearn.model_selection import train_test_split

X, y = fetch_rrlyrae_combined()

# NOTE: every column of X is used for fitting; only the first two columns
# are drawn in the scatter plot below.
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Plot the last N_plot samples.
N_plot = 5000
plt.scatter(X[-N_plot:, 0], X[-N_plot:, 1], c=y[-N_plot:],
            edgecolors='none')
plt.xlabel('u-g color')
plt.ylabel('g-r color')

from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Count of correctly and incorrectly classified test samples.
print(np.sum(y_pred == y_test))
print(np.sum(y_pred != y_test))

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred,
                            target_names=['MS star', 'RR Lyrae']))

from astroML.datasets import fetch_sdss_specgals

data = fetch_sdss_specgals()

# put magnitudes in a matrix (one column per band in 'ugriz')
X = np.vstack([data['modelMag_%s' % f] for f in 'ugriz']).T
y = data['z']

# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y)

from sklearn.linear_model import LinearRegression

est = LinearRegression()
est.fit(X_train, y_train)
y_pred = est.predict(X_test)

# Predicted against true values, with the y = x line drawn for reference.
plt.plot(y_test, y_pred, ',k')
plt.plot([0, 1], [0, 1], ':k')
plt.xlim(0, 0.6)
plt.ylim(0, 0.6)

# Root-mean-square error of the predictions on the test set.
rms = np.sqrt(np.mean((y_test - y_pred) ** 2))
print(rms)

from sklearn.ensemble import RandomForestClassifier

# ``RandomForestClassifier?`` is IPython-only syntax; ``help`` shows the same
# documentation and is valid in any Python interpreter.
help(RandomForestClassifier)