#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')

from fastai.nlp import *
from sklearn.linear_model import LogisticRegression


# ## IMDB dataset and the sentiment classification task

# The [large movie review dataset](http://ai.stanford.edu/~amaas/data/sentiment/) contains a collection of 50,000 reviews from IMDB, with an even number of positive and negative reviews. The authors considered only highly polarized reviews: a negative review has a score ≤ 4 out of 10, and a positive review has a score ≥ 7 out of 10. Neutral reviews are not included. The dataset is divided evenly into a training set and a test set of 25,000 labeled reviews each.
# 
# The **sentiment classification task** consists of predicting the polarity (positive or negative) of a given text.
# 
# To get the dataset, run the following commands in your terminal:
# 
# `wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz`
# 
# `gunzip aclImdb_v1.tar.gz`
# 
# `tar -xvf aclImdb_v1.tar`

# ### Tokenizing and term-document matrix creation

# In[2]:


PATH='data/aclImdb/'
names = ['neg','pos']


# In[3]:


get_ipython().run_line_magic('ls', '{PATH}')


# In[22]:


get_ipython().run_line_magic('ls', '{PATH}train')


# In[23]:


get_ipython().run_line_magic('ls', '{PATH}train/pos | head')


# In[24]:


trn,trn_y = texts_labels_from_folders(f'{PATH}train',names)
val,val_y = texts_labels_from_folders(f'{PATH}test',names)


# Here is the text of the first review.

# In[25]:


trn[0]


# In[26]:


trn_y[0]


# [`CountVectorizer`](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) converts a collection of text documents to a matrix of token counts (part of `sklearn.feature_extraction.text`).

# In[27]:


veczr = CountVectorizer(tokenizer=tokenize)


# `fit_transform(trn)` finds the vocabulary in the training set and transforms the training set into a term-document matrix. Since we have to apply the *same transformation* to the validation set, the second line uses just the method `transform(val)`. `trn_term_doc` and `val_term_doc` are sparse matrices. `trn_term_doc[i]` represents training document i: it contains a count of each vocabulary word in that document.

# In[28]:


trn_term_doc = veczr.fit_transform(trn)
val_term_doc = veczr.transform(val)


# In[29]:


trn_term_doc


# In[30]:


trn_term_doc[0]


# In[31]:


vocab = veczr.get_feature_names(); vocab[5000:5005]


# In[32]:


w0 = set([o.lower() for o in trn[0].split(' ')]); w0


# In[33]:


len(w0)


# In[34]:


veczr.vocabulary_['absurd']


# In[35]:


trn_term_doc[0,1297]


# In[36]:


trn_term_doc[0,5000]


# ## Naive Bayes

# We define the **log-count ratio** $r$ for each word $f$:
# 
# $r = \log \frac{\text{ratio of feature $f$ in positive documents}}{\text{ratio of feature $f$ in negative documents}}$
# 
# where the ratio of feature $f$ in positive documents is the number of times feature $f$ appears in positive documents divided by the number of positive documents (with add-one smoothing in the code below).

# In[89]:


def pr(y_i):
    # smoothed count of each feature in documents of class y_i,
    # divided by the (smoothed) number of documents of that class
    p = x[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)


# In[90]:


x=trn_term_doc
y=trn_y

r = np.log(pr(1)/pr(0))
b = np.log((y==1).mean() / (y==0).mean())


# Here is the Naive Bayes prediction: a document is classified as positive when its word counts, weighted by the log-count ratios, plus the log of the class prior ratio `b`, exceed zero.

# In[84]:


pre_preds = val_term_doc @ r.T + b
preds = pre_preds.T>0
(preds==val_y).mean()
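# To make the log-count ratio concrete, here is a small worked example on a hand-built, hypothetical term-document matrix (the `toy_*` names and the three-word vocabulary are illustrative, not from the IMDB data): 'good' appears only in positive documents, 'bad' only in negative ones, and 'movie' equally in both.

# In[ ]:


import numpy as np

# rows are documents, columns are counts of ['good', 'bad', 'movie']
toy_x = np.array([[2, 0, 1],   # "good good movie"  (positive)
                  [1, 0, 1],   # "good movie"       (positive)
                  [0, 1, 1],   # "bad movie"        (negative)
                  [0, 2, 1]])  # "bad bad movie"    (negative)
toy_y = np.array([1, 1, 0, 0])

def toy_pr(y_i):
    # same smoothed ratio as pr() above
    p = toy_x[toy_y==y_i].sum(0)
    return (p+1) / ((toy_y==y_i).sum()+1)

toy_r = np.log(toy_pr(1)/toy_pr(0))
toy_b = np.log((toy_y==1).mean() / (toy_y==0).mean())
toy_r, toy_b
# toy_r ≈ [ 1.39, -1.39,  0.  ]: 'good' votes positive, 'bad' votes negative,
# and 'movie' carries no signal; toy_b is 0 because the classes are balanced.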
# Binarized Naive Bayes clips each count to 0 or 1 with `.sign()`, so a word counts at most once per document.

# In[91]:


x=trn_term_doc.sign()
r = np.log(pr(1)/pr(0))

pre_preds = val_term_doc.sign() @ r.T + b
preds = pre_preds.T>0
(preds==val_y).mean()


# ### Logistic regression

# Here is how we can fit logistic regression where the features are the unigrams.

# In[22]:


m = LogisticRegression(C=1e8, dual=True)
m.fit(trn_term_doc, y)    # fit on raw counts, to match val_term_doc below

preds = m.predict(val_term_doc)
(preds==val_y).mean()


# In[23]:


m = LogisticRegression(C=1e8, dual=True)
m.fit(trn_term_doc.sign(), y)
preds = m.predict(val_term_doc.sign())
(preds==val_y).mean()


# ...and the regularized version.

# In[24]:


m = LogisticRegression(C=0.1, dual=True)
m.fit(trn_term_doc, y)

preds = m.predict(val_term_doc)
(preds==val_y).mean()


# In[25]:


m = LogisticRegression(C=0.1, dual=True)
m.fit(trn_term_doc.sign(), y)
preds = m.predict(val_term_doc.sign())
(preds==val_y).mean()


# ### Trigram with NB features

# Our next model is a version of logistic regression with Naive Bayes features, described [here](https://www.aclweb.org/anthology/P12-2018). For every document we compute binarized features as described above, but this time we use bigrams and trigrams too. Each binarized feature is scaled by its log-count ratio, and a logistic regression model is then trained on these features to predict sentiment.

# In[9]:


veczr = CountVectorizer(ngram_range=(1,3), tokenizer=tokenize, max_features=800000)
trn_term_doc = veczr.fit_transform(trn)
val_term_doc = veczr.transform(val)


# In[10]:


trn_term_doc.shape


# In[11]:


vocab = veczr.get_feature_names()


# In[12]:


vocab[200000:200005]


# In[13]:


y=trn_y
x=trn_term_doc.sign()
val_x = val_term_doc.sign()


# In[16]:


r = np.log(pr(1) / pr(0))
b = np.log((y==1).mean() / (y==0).mean())


# Here we fit regularized logistic regression where the features are the n-grams (unigrams through trigrams).

# In[42]:


m = LogisticRegression(C=0.1, dual=True)
m.fit(x, y);

preds = m.predict(val_x)
(preds.T==val_y).mean()


# Here is the log-count ratio `r`.

# In[43]:


r.shape, r


# In[44]:


np.exp(r)


# Here we fit regularized logistic regression where the features are the n-grams scaled by their log-count ratios.

# In[45]:


x_nb = x.multiply(r)
m = LogisticRegression(dual=True, C=0.1)
m.fit(x_nb, y);

val_x_nb = val_x.multiply(r)
preds = m.predict(val_x_nb)
(preds.T==val_y).mean()


# ## fastai NBSVM++

# In[17]:


sl=2000


# In[18]:


# Here is how we get a model from a bag of words
md = TextClassifierData.from_bow(trn_term_doc, trn_y, val_term_doc, val_y, sl)


# In[19]:


learner = md.dotprod_nb_learner()
learner.fit(0.02, 1, wds=1e-6, cycle_len=1)


# In[159]:


learner.fit(0.02, 2, wds=1e-6, cycle_len=1)


# In[160]:


learner.fit(0.02, 2, wds=1e-6, cycle_len=1)


# ## References

# * Sida Wang and Christopher D. Manning. Baselines and Bigrams: Simple, Good Sentiment and Topic Classification. ACL 2012. [pdf](https://www.aclweb.org/anthology/P12-2018)
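# ## Appendix: the NBSVM interpolation trick

# A minimal sketch of the weight interpolation described in the Wang & Manning paper above, which the fastai `dotprod_nb_learner` builds on: blend the fitted logistic-regression weights `w` with their mean magnitude, so rarely-seen features fall back toward an uninformative average instead of keeping unreliable weights. This assumes `m`, `val_x_nb` and `val_y` from the trigram cells above; `beta` is the paper's interpolation hyperparameter in [0, 1], and `interpolate` is a name chosen here for illustration.

# In[ ]:


def interpolate(w, beta=0.25):
    # w' = (1 - beta) * mean(|w|) + beta * w   (Wang & Manning, 2012)
    w_bar = np.abs(w).mean()
    return (1-beta)*w_bar + beta*w

w = m.coef_.ravel()                      # weights fit on the NB-scaled features
pre_preds = val_x_nb @ interpolate(w) + m.intercept_
((pre_preds > 0) == val_y).mean()

# With beta=1 this recovers plain logistic regression on the NB features;
# smaller beta trusts the shared prior more, which matters most when
# training data is scarce.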