# Note: the directive above is used to prevent the code being executed during release.
# If you have downloaded the notebook for your own use, you can remove the directive,
# but this is not necessary (it is just a comment).
from datasets import load_dataset, ClassLabel
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, mutual_info_classif, chi2
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import FeatureUnion
from sklearn.metrics import confusion_matrix, classification_report
from textplumber.core import *
from textplumber.clean import *
from textplumber.preprocess import *
from textplumber.tokens import *
from textplumber.pos import *
from textplumber.embeddings import *
from textplumber.report import *
from textplumber.store import *
from textplumber.lexicons import *
from textplumber.textstats import *
from imblearn.under_sampling import RandomUnderSampler
Example Notebook
The intention of Textplumber is to make it easy to extract features from text data as part of a scikit-learn pipeline. It allows you to extract different kinds of features from text, which you can combine as needed. This example demonstrates functionality using different datasets. If you are accessing this example from the documentation site, you can download the notebook from GitHub.
You can install Textplumber using pip …
pip install textplumber
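To give a sense of where the notebook is heading, here is a minimal sketch of the workflow it builds up. This is not run here: the store filename is made up for illustration and section 3 builds the full, configurable version step by step.
# minimal sketch only - section 3 builds the real pipeline
feature_store = TextFeatureStore('quickstart.sqlite') # cache for preprocessed features (illustrative filename)
pipeline = Pipeline([
    ('cleaner', TextCleaner(strip_whitespace=True)), # tidy up the raw text
    ('spacy', SpacyPreprocessor(feature_store=feature_store)), # tokenize with spaCy and cache the results
    ('tokens', TokensVectorizer(feature_store=feature_store, vectorizer_type='count', max_features=100)), # count the most frequent tokens
    ('classifier', LogisticRegression(max_iter=5000)), # any scikit-learn classifier can go here
])
# pipeline.fit(X_train, y_train) would then train the model on your texts and labels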
1. Setup
These settings control the display of Pandas dataframes in the notebook.
pd.set_option('display.max_columns', None) # show all columns
pd.set_option('display.max_colwidth', 500) # increase this to see more text in the dataframe
Get word lists:

- The stop word list is from NLTK.
- All of the word lists (including the stop word list) can be used to extract lexicon count features, i.e. features based on counts of words from a given set.
stop_words = get_stop_words()
stop_words_lexicon = {'stop_words': stop_words}
empath_lexicons = get_empath_lexicons()
vader_lexicons = get_sentiment_lexicons()
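Each lexicon is just a dictionary mapping a lexicon name to a list of words (as stop_words_lexicon above shows), so you can also define your own and pass it wherever a lexicon is expected. A minimal sketch - the lexicon name and word list here are made up for illustration:
# an illustrative custom lexicon - any name mapped to any list of words will do
custom_lexicons = {'negation': ['no', 'not', 'never', 'nothing', "n't"]}
# it could later be passed to the lexicon component of the pipeline in section 3, e.g.:
# LexiconCountVectorizer(feature_store=feature_store, lexicons=custom_lexicons)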
2. Load and inspect data
2.1 Choose a dataset and preview the labels
Below you can select a dataset. The options are sentiment, clickbait, essay and movie_reviews. Change the value of dataset_option below. The datasets (available on Huggingface) will be downloaded automatically and a link provided to the dataset card with more information.
dataset_option = 'movie_reviews' # 'sentiment', 'clickbait', 'essay', or 'movie_reviews'
if dataset_option == 'sentiment':
    dataset_name = 'cardiffnlp/tweet_eval'
    dataset_dir = 'sentiment'
    target_labels = ['negative', 'neutral', 'positive']
    text_column = 'text'
    label_column = 'label'
    train_split_name = 'train'
    test_split_name = 'validation'
    print('You selected the sentiment dataset. Read more about this at https://huggingface.co/datasets/cardiffnlp/tweet_eval')
elif dataset_option == 'clickbait':
    dataset_name = 'christinacdl/clickbait_detection_dataset'
    dataset_dir = None
    target_labels = ['CLICKBAIT', 'NOT']
    text_column = 'text'
    label_column = 'label'
    train_split_name = 'train'
    test_split_name = 'validation'
    print('You selected the clickbait dataset. Read more about this at https://huggingface.co/datasets/christinacdl/clickbait_detection_dataset')
elif dataset_option == 'essay':
    dataset_name = 'polsci/ghostbuster-essay-cleaned'
    dataset_dir = None
    target_labels = ['claude', 'gpt', 'human']
    text_column = 'text'
    label_column = 'label'
    train_split_name = 'train'
    test_split_name = 'test'
    print('You selected the essay dataset. Read more about this at https://huggingface.co/datasets/polsci/ghostbuster-essay-cleaned')
else:
    dataset_name = 'polsci/sentiment-polarity-dataset-v2.0'
    dataset_dir = None
    target_labels = ['neg', 'pos']
    text_column = 'text'
    label_column = 'label'
    train_split_name = 'train'
    test_split_name = 'train'
    print('You selected the movie_reviews dataset. Read more about this at https://huggingface.co/datasets/polsci/sentiment-polarity-dataset-v2.0')
You selected the movie_reviews dataset. Read more about this at https://huggingface.co/datasets/polsci/sentiment-polarity-dataset-v2.0
Make sure you go to the link above to read more about the selected dataset.
Important notes about specific datasets:
- For the sentiment dataset, it is challenging to get good accuracy with three classes. If you like you can remove the neutral class. There is a cell below that does this for you - don't change the cell above.
- For the essay dataset, there are differences in punctuation between classes. To avoid fitting to a quirk of the data, you can replace characters via the TextCleaner component like this: TextCleaner(strip_whitespace=True, character_replacements = {"’": "'", '“': '"', '”': '"'})
This loads the dataset.
dataset = load_dataset(dataset_name, data_dir=dataset_dir)
This cell will show you information on the dataset fields and splits.
preview_dataset(dataset)
Split: train (2000 samples)
Available fields: text, label, fileid
- Field 'text' has 2000 unique values
Value(dtype='string', id=None)
- Field 'label' has 2 unique values
ClassLabel(names=['neg', 'pos'], id=None)
- Field 'fileid' has 2000 unique values
Value(dtype='string', id=None)
Notices
- Field 'text' appears to be a text column.
- Field 'label' is a label column (ClassLabel).
- Field 'fileid' appears to be a text column.
This cell will cast the label column to a ClassLabel type if it isn’t already.
cast_column_to_label(dataset, label_column)
label_names = get_label_names(dataset, label_column)
Column 'label' is already a ClassLabel.
Here is the breakdown of the composition of labels in each split.
# Note: in future this example will be updated to use `preview_split_by_label_column`
dfs = {}
for split in dataset.keys():
    dfs[split] = dataset[split].to_pandas()
    dfs[split].insert(1, 'label_name', dfs[split][label_column].apply(lambda x: dataset[split].features[label_column].int2str(x)))
    preview_label_counts(dfs[split], label_column, label_names)
label | label_name | count
---|---|---
0 | neg | 1000
1 | pos | 1000
2.2 Configure the labels (optional)
- You can override the default labels for the data-set here to make the task more or less challenging. High accuracy does not guarantee a high grade.
- See the assignment instructions and the dataset card or corresponding paper for explanations of the data.
- Read the comments below, uncomment the relevant lines for your dataset, and amend the label names if needed.
- Remember, this is optional.
# for the movie reviews dataset (this is just for testing/demonstration) - there are 2 labels and that is it!
# for the sentiment dataset - there are 3 labels - you can make the task simpler as a binary classification problem using one of these options:
#target_labels = ['negative', 'neutral']
#target_labels = ['negative', 'positive']
#target_labels = ['neutral', 'positive']
# for the clickbait dataset there are only 2 labels - so it is already a binary classification problem
# for the essay dataset - there are 7 labels - you can make the task simpler as a binary classification problem using one of these options:
#target_labels = ['claude', 'gpt']
#target_labels = ['human', 'gpt']
#target_labels = ['human', 'claude']
# for the genre dataset ... TODO
print(target_labels)
['neg', 'pos']
2.3 Prepare the train and test splits
- This cell handles the train-test split for you.
- Some of the data-sets are unbalanced. This cell will balance the data-sets using under-sampling.
target_classes = [label_names.index(name) for name in target_labels]
target_names = [label_names[i] for i in target_classes]

if train_split_name == test_split_name:
    X = dataset[train_split_name].to_pandas()
    X.insert(1, 'label_name', dfs[train_split_name][label_column].apply(lambda x: dataset[train_split_name].features[label_column].int2str(x)))
    y = np.array(dataset[train_split_name][label_column])

    mask = np.isin(y, target_classes)
    X = X.loc[mask]
    y = y[mask]

    # creating df splits with original data first - so can look at the train data if needed
    dfs['train'], dfs['test'], y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # we're just using the text for features
    X_train = np.array(dfs['train'][text_column])
    X_test = np.array(dfs['test'][text_column])
else:
    X_train = np.array(dataset[train_split_name][text_column])
    y_train = np.array(dataset[train_split_name][label_column])
    X_test = np.array(dataset[test_split_name][text_column])
    y_test = np.array(dataset[test_split_name][label_column])

    mask = np.isin(y_train, target_classes)
    mask_test = np.isin(y_test, target_classes)

    X_train = X_train[mask]
    y_train = y_train[mask]
    X_test = X_test[mask_test]
    y_test = y_test[mask_test]

# this cell undersamples all but the minority class to balance the training data
X_train = X_train.reshape(-1, 1)
X_train, y_train = RandomUnderSampler(random_state=0).fit_resample(X_train, y_train)
X_train = X_train.reshape(-1)

preview_splits(X_train, y_train, X_test, y_test, target_classes = target_classes, target_names = target_names)
Train: 1600 samples, 2 classes

 | label_name | count
---|---|---
0 | neg | 800
1 | pos | 800

Test: 400 samples, 2 classes

 | label_name | count
---|---|---
0 | neg | 200
1 | pos | 200
2.4 Preview the texts
Time to get to know your data. We will only preview the train split.
y_train_names = map(lambda x: label_names[x], y_train)
display(dfs['train'].sample(10))
 | text | label_name | label | fileid
---|---|---|---|---
3 | " quest for camelot " is warner bros . ' first feature-length , fully-animated attempt to steal clout from disney's cartoon empire , but the mouse has no reason to be worried . \nthe only other recent challenger to their throne was last fall's promising , if flawed , 20th century fox production " anastasia , " but disney's " hercules , " with its lively cast and colorful palate , had her beat hands-down when it came time to crown 1997's best piece of animation . \nthis year , it's no contes... | neg | 0 | cv003_12683.txt |
1475 | rated : r for strong language , sexual dialogue , drug use , crude humor , violence and brief nudity . \nstarring : ben affleck , matt damon , linda fiorentino , salma hayek , alan rickman , chris rock , kevin smith , jason mewes , jason lee , george carlin , alanis morissette . \nrunning time : 130 minutes \nbeing a huge fan of kevin smith , i was expecting a lot out of his newest project 'dogma' . \nit might just be kevin's best work to date . \nit's very funny with smart and foul-mouthed ... | pos | 1 | cv475_21692.txt |
1452 | if you're the type of person who goes on the submarine ride every time you visit disneyland , you're going to love the hunt for red october . \nyou'll also love the film if you enjoy cat and mouse military tactics , or if you're a sean connery or alec baldwin fan , or if you admired director john mctiernan's earlier films , die hard and predator . \nin fact , the only people likely to be disappointed with the hunt for red october are those who have read the book , since films almost never li... | pos | 1 | cv452_5088.txt |
704 | after 1993's " falling down , " i hoped that joel schumacher would mature into a great director . \nsince then he has offered us two so-so adaptations of john grisham novels ( " the client " and " a time to kill " ) and two batman movies that lowered the standards of that franchise . \nalthough these disappointments dampened my enthusiasm for schumacher's potential , the publicity for his latest release , " 8mm , " raised new hope . \nit promised to be something unusual . \nit wasn't . \nthe... | neg | 0 | cv704_17622.txt |
1651 | it might surprise some to know that joel and ethan coen , who have brought such unabated lunacy to our movie screens as " raising arizona " and " the hudsucker proxy , " made their feature film debut with " blood simple , " a grim and often gruesome tale of revenge , murder , and literally fatal misconceptions in rural texas . \nit bears some resemblance , story-wise , to the coens' recent " fargo , " but even the darkly satirical humor and the enjoyably quirky characterizations that charact... | pos | 1 | cv651_10492.txt |
1098 | meet joe black ( reviewed on nov . 27/98 ) \nstarring brad pitt , anthony hopkins , claire forlani \nin " meet joe black " , brad pitt plays death . \nthat's all that really needs to be said , but nevertheless , i will provide the three of you that have seemingly been living in a cave with a plot description . \ndeath decides to take a holiday , what with all the rigors of soul-collecting and all , and forces anthony hopkins into showing him what it's like to be human . \ndeath assumes the b... | pos | 1 | cv098_15435.txt |
1110 | plot : a bunch of bad guys dressed up as elvis impersonators rob a vegas casino during a presley convention . \nthe boys eventually get together to split the money , but as plans change , double-crosses occur , dealing and wheeling goes down and the crew set up for the road . \nwho's on the up and up , who's the real bad guy and who's gonna get to bang courteney cox are just a few of the questions which will be answered by the rest of this movie . \ncritique : the funnest movie that i've see... | pos | 1 | cv110_27788.txt |
827 | the best thing about , " lake placid " is that it's only 80 minutes long and when it's over you're glad that you didn't waste more than an hour and a half of your time . \nit's nothing more than a bad rip-off of , " jaws " ( and i think that's being kind . ) \nit was written by david e . kelly ( " ally mcbeal " ) as a horror-comedy but fails at both , miserably . \ni was never scared and i think that i only laughed once . \nthe crocodile even fails in comparison to the snake in , " anaconda ... | neg | 0 | cv827_19479.txt |
1859 | it is with hesitance that i call " apocalypse now " a masterpiece . \ncertainly , it had the pedigree to be one of the greatest films ever made , with a director known for producing masterpieces with ease , and some of the finest actors of the 1970's . \nthe plot , an adaptation of joseph conrad's " heart of darkness " , was set in vietnam , and the timing of the film was supposed to be brilliant , coming on the heels of the end of the war . \n " apocalypse " certainly has its moments , some... | pos | 1 | cv859_14107.txt |
1738 | here's a word analogy : amistad is to the lost world as schindler's list is to jurassic park . \nin 1993 , after steven spielberg made the monster dino hit , many critics described schindler's list as the director's " penance " ( as if there was a need for him to apologize for making a crowd-pleasing blockbuster ) . \nnow , after a three-year layoff , spielberg is back with a vengeance . \nonce again , his summer release was special effects-loaded action/adventure flick with dinosaurs munchi... | pos | 1 | cv738_10116.txt |
Enter the index (the number in the first column) as selected_index to see the row. The limit value controls how much of the text you see. Set a higher limit to see more of the text or set it to 0 to see all of the text.
# We can display the full text of a selected article by dataframe index
selected_index = 10

preview_row_text(dfs['train'], selected_index, text_column = text_column, limit=400) # change limit to see more of the text if needed
Attribute | Value
---|---
label_name | neg
label | 0
fileid | cv010_29063.txt
text:
best remembered for his understated performance as dr . hannibal lecter in
michael mann's forensics thriller , manhunter , scottish character actor brian
cox brings something special to every movie he works on . usually playing a bit
role in some studio schlock ( he dies halfway through the long kiss goodnight )
, he's only occasionally given something meaty and substantial to do . if you
want t...
3. Create a classification pipeline and train a model
Create a scikit-learn pipeline to preprocess the texts and train a classification model. Pipeline components are added in as you work through the notebook. There are a number of pipeline components you can access through the textplumber package. You will have an opportunity to learn about these in labs, but documentation is available here.
To speed up preprocessing, some of the pipeline components cache preprocessed data to avoid recomputing it. Run this as is - it will create an SQLite file with the name of your dataset option in the directory of the notebook. This will speed up some repeated processing (e.g. tokenization with Spacy).
feature_store = TextFeatureStore(f'example-{dataset_option}.sqlite')
The pipeline below includes a number of different components. Most are commented out on the first run of the notebook. There are lots of options for each component. You can look at the documentation to learn about these. These components can extract different kinds of features, any of which can be applied to build a model. The potential feature types include:
- Token features
- Bigram features
- Parts of speech features
- Lexicon-based features
- Document-level statistics
- Text embeddings
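Any of these can be added as an extra branch of the FeatureUnion in the cell below. For example, bigram counts can be extracted by giving another TokensVectorizer an ngram_range of (2, 2). A minimal sketch - the 'bigrams' step name is just illustrative, and the branch would be pasted into the transformer list below:
# sketch: an extra FeatureUnion branch counting the 50 most frequent bigrams
bigram_branch = ('bigrams',
    Pipeline([
        ('bigram_vectorizer', TokensVectorizer(feature_store=feature_store, vectorizer_type='count', max_features=50, lowercase=True, ngram_range=(2, 2))),
    ], verbose=True))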
# you can uncomment components below to create a more complex pipeline
pipeline = Pipeline([
    ('cleaner', TextCleaner(strip_whitespace=True)), # for the essay dataset you should use character_replacements = {"’": "'", '“': '"', '”': '"',}
    ('spacy', SpacyPreprocessor(feature_store=feature_store)),
    ('features', FeatureUnion([
        ('tokens', # token features - these can be single tokens or ngrams of tokens using TokensVectorizer - see textplumber documentation for examples
            Pipeline([
                ('spacy_token_vectorizer', TokensVectorizer(feature_store = feature_store, vectorizer_type='count', max_features=100, lowercase = True, remove_punctuation = True, stop_words = stop_words, min_df=0.0, max_df=1.0, ngram_range=(1, 1))),
                # ('selector', SelectKBest(score_func=mutual_info_classif, k=100)), # uncomment for feature selection
                # ('scaler', StandardScaler(with_mean=False)),
            ], verbose = True)),
        # ('pos', # pos features - these can be a single label or ngrams of pos tags using POSVectorizer - see textplumber documentation for examples
        #     Pipeline([
        #         ('spacy_pos_vectorizer', POSVectorizer(feature_store=feature_store)),
        #         #('selector', SelectKBest(score_func=mutual_info_classif, k=5)),
        #         ('scaler', StandardScaler(with_mean=False)),
        #     ], verbose = True)),
        # ('textstats', # document-level text statistics using TextstatsTransformer - see textplumber documentation for examples
        #     Pipeline([
        #         ('textstats_vectorizer', TextstatsTransformer(feature_store=feature_store)),
        #         ('scaler', StandardScaler(with_mean=False)),
        #     ], verbose = True)),
        # ('lexicon', # lexicon features - defined above are empath_lexicons, vader_lexicons and stop_words_lexicon - see textplumber documentation for examples
        #     Pipeline([
        #         ('lexicon_vectorizer', LexiconCountVectorizer(feature_store=feature_store, lexicons=empath_lexicons)), # the notebook has already provided example lexicons right at the top!
        #         #('selector', SelectKBest(score_func=mutual_info_classif, k=5)),
        #         ('scaler', StandardScaler(with_mean=False)),
        #     ], verbose = True)),
        # ('embeddings', Model2VecEmbedder(feature_store=feature_store)), # extract embeddings using Model2Vec - see textplumber documentation for examples
    ], verbose = True)),
    ('classifier', LogisticRegression(max_iter=5000, random_state=42)) # for logistic regression - only select one classifier!
    # ('classifier', DecisionTreeClassifier(max_depth = 3, random_state=42)) # for decision tree - only select one classifier!
], verbose = True) # using verbose because I like to see what is going on
display(pipeline)
Pipeline(steps=[('cleaner', TextCleaner(strip_whitespace=True)), ('spacy', SpacyPreprocessor(feature_store=<textplumber.store.TextFeatureStore object at 0x7f3dbe976890>)), ('features', FeatureUnion(transformer_list=[('tokens', Pipeline(steps=[('spacy_token_vectorizer', TokensVectorizer(feature_store=<textplumber.store.TextFeatureStore object at 0x7f3dbe976890>,... remove_punctuation=True, stop_words=["'d", "'ll", "'m", "'re", "'s", "'ve", 'a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', ...]))], verbose=True))], verbose=True)), ('classifier', LogisticRegression(max_iter=5000, random_state=42))], verbose=True)
TextCleaner(strip_whitespace=True)
SpacyPreprocessor(feature_store=<textplumber.store.TextFeatureStore object at 0x7f3dbe976890>)
FeatureUnion(transformer_list=[('tokens', Pipeline(steps=[('spacy_token_vectorizer', TokensVectorizer(feature_store=<textplumber.store.TextFeatureStore object at 0x7f3dbe976890>, lowercase=True, max_features=100, min_df=0.0, remove_punctuation=True, stop_words=["'d", "'ll", "'m", "'re", "'s", "'ve", 'a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', ...]))], verbose=True))], verbose=True)
TokensVectorizer(feature_store=<textplumber.store.TextFeatureStore object at 0x7f3dbe976890>, lowercase=True, max_features=100, min_df=0.0, remove_punctuation=True, stop_words=["'d", "'ll", "'m", "'re", "'s", "'ve", 'a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', ...])
LogisticRegression(max_iter=5000, random_state=42)
Note: the preprocessing stage will be slow the first time you run this cell, but the preprocessed features will be loaded from the feature store on subsequent training using the same training data.
pipeline.fit(X_train, y_train)
[Pipeline] ........... (step 1 of 4) Processing cleaner, total= 0.0s
[Pipeline] ............. (step 2 of 4) Processing spacy, total= 0.1s
/home/geoff/miniconda3/envs/textplumber/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:402: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ["'", 'b', 'c', 'e', 'f', 'g', 'h', 'j', 'l', 'n', 'p', 'r', 'u', 'v', 'w'] not in stop_words.
warnings.warn(
[Pipeline] (step 1 of 1) Processing spacy_token_vectorizer, total= 0.8s
[FeatureUnion] ........ (step 1 of 1) Processing tokens, total= 0.8s
[Pipeline] .......... (step 3 of 4) Processing features, total= 0.8s
[Pipeline] ........ (step 4 of 4) Processing classifier, total= 0.0s
Pipeline(steps=[('cleaner', TextCleaner(strip_whitespace=True)), ('spacy', SpacyPreprocessor(feature_store=<textplumber.store.TextFeatureStore object at 0x7f3dbe976890>)), ('features', FeatureUnion(transformer_list=[('tokens', Pipeline(steps=[('spacy_token_vectorizer', TokensVectorizer(feature_store=<textplumber.store.TextFeatureStore object at 0x7f3dbe976890>,... remove_punctuation=True, stop_words=["'d", "'ll", "'m", "'re", "'s", "'ve", 'a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', ...]))], verbose=True))], verbose=True)), ('classifier', LogisticRegression(max_iter=5000, random_state=42))], verbose=True)
TextCleaner(strip_whitespace=True)
SpacyPreprocessor(feature_store=<textplumber.store.TextFeatureStore object at 0x7f3dbe976890>)
FeatureUnion(transformer_list=[('tokens', Pipeline(steps=[('spacy_token_vectorizer', TokensVectorizer(feature_store=<textplumber.store.TextFeatureStore object at 0x7f3dbe976890>, lowercase=True, max_features=100, min_df=0.0, remove_punctuation=True, stop_words=["'d", "'ll", "'m", "'re", "'s", "'ve", 'a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', ...]))], verbose=True))], verbose=True)
TokensVectorizer(feature_store=<textplumber.store.TextFeatureStore object at 0x7f3dbe976890>, lowercase=True, max_features=100, min_df=0.0, remove_punctuation=True, stop_words=["'d", "'ll", "'m", "'re", "'s", "'ve", 'a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', ...])
LogisticRegression(max_iter=5000, random_state=42)
Run the predictions and output model metrics and a confusion matrix using this cell.
y_predicted = pipeline.predict(X_test)
print(classification_report(y_test, y_predicted, target_names = target_names, digits=3))
plot_confusion_matrix(y_test, y_predicted, target_classes, target_names)
precision recall f1-score support
neg 0.708 0.690 0.699 200
pos 0.698 0.715 0.706 200
accuracy 0.703 400
macro avg 0.703 0.702 0.702 400
weighted avg 0.703 0.703 0.702 400
The cell below is commented out, but you have the option to uncomment it to run a grid search based on the pipeline you’ve created above.
# # Note: if you get a warning about tokenizers and parallelism - uncomment this line
# # os.environ["TOKENIZERS_PARALLELISM"] = "false"
#
# # setup gridsearch to test different max_features
# from sklearn.model_selection import GridSearchCV
# param_grid = {
# 'features__tokens__spacy_token_vectorizer__max_features': [50, 100, 150, 200, 250, 300], # this assumes you are using the tokens part of the pipeline
# # 'features__tokens__selector__k': [50, 100, 150, 200, 250, 300], # this assumes you have enabled the selector for tokens
# }
# grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='f1_macro', verbose=100, n_jobs=1)
# grid_search.fit(X_train, y_train)
# print('\n-----------------------------------------------------------------')
# print("Best parameters found: ", grid_search.best_params_)
# print("Best score found: ", grid_search.best_score_)
# print('-----------------------------------------------------------------\n')
# y_pred = grid_search.predict(X_test)
# print(classification_report(y_test, y_pred, target_names = target_names, digits=3))
# plot_confusion_matrix(y_test, y_pred, target_classes, target_names)
4. Evaluate your model and investigate model predictions
You already have some metrics in the cell above. Below is some additional reporting to help you understand your model.
4.1 Understand feature extraction and selection
Use preview_pipeline_features to examine the features being extracted and selected by the steps of your pipeline. The final classifier step will show the features input for training the classifier.
preview_pipeline_features(pipeline)
cleaner TextCleaner
This step receives and returns text.
spacy SpacyPreprocessor
This step receives and returns text.
features FeatureUnion
spacy_token_vectorizer TokensVectorizer
Features Out (100)
action, actually, almost, also, although, another, around, audience, back, bad, best, better, big, cast, character, characters, come, comedy, comes, director, end, enough, even, ever, every, fact, film, films, find, first, funny, get, gets, go, going, good, great, however, john, know, last, life, like, little, long, look, love, made, make, makes, man, many, may, movie, movies, much, never, new, nothing, old, one, people, performance, played, plays, plot, real, really, right, role, say, scene, scenes, script, see, seems, seen, show, since, something, star, still, story, take, thing, things, think, though, three, time, two, us, way, well, without, work, world, year, years, young
classifier LogisticRegression
Features In (100)
tokens__action, tokens__actually, tokens__almost, tokens__also, tokens__although, tokens__another, tokens__around, tokens__audience, tokens__back, tokens__bad, tokens__best, tokens__better, tokens__big, tokens__cast, tokens__character, tokens__characters, tokens__come, tokens__comedy, tokens__comes, tokens__director, tokens__end, tokens__enough, tokens__even, tokens__ever, tokens__every, tokens__fact, tokens__film, tokens__films, tokens__find, tokens__first, tokens__funny, tokens__get, tokens__gets, tokens__go, tokens__going, tokens__good, tokens__great, tokens__however, tokens__john, tokens__know, tokens__last, tokens__life, tokens__like, tokens__little, tokens__long, tokens__look, tokens__love, tokens__made, tokens__make, tokens__makes, tokens__man, tokens__many, tokens__may, tokens__movie, tokens__movies, tokens__much, tokens__never, tokens__new, tokens__nothing, tokens__old, tokens__one, tokens__people, tokens__performance, tokens__played, tokens__plays, tokens__plot, tokens__real, tokens__really, tokens__right, tokens__role, tokens__say, tokens__scene, tokens__scenes, tokens__script, tokens__see, tokens__seems, tokens__seen, tokens__show, tokens__since, tokens__something, tokens__star, tokens__still, tokens__story, tokens__take, tokens__thing, tokens__things, tokens__think, tokens__though, tokens__three, tokens__time, tokens__two, tokens__us, tokens__way, tokens__well, tokens__without, tokens__work, tokens__world, tokens__year, tokens__years, tokens__young
4.2 Classifier-specific features
If you are using a Decision Tree classifier in your pipeline, this will plot it …
if pipeline.named_steps['classifier'].__class__.__name__ == 'DecisionTreeClassifier':
    plot_decision_tree_from_pipeline(pipeline, X_train, y_train, target_classes, target_names, 'classifier', 'features')
else:
    print('The classifier is not a decision tree - so no plot is shown!')
The classifier is not a decision tree - so no plot is shown!
If you are using a Logistic Regression classifier in your pipeline, this will plot the coefficients of the features in the model.
if pipeline.named_steps['classifier'].__class__.__name__ == 'LogisticRegression':
    plot_logistic_regression_features_from_pipeline(pipeline, target_classes, target_names, top_n=20, classifier_step_name = 'classifier', features_step_name = 'features')
else:
    print('The classifier is not a logistic regression - so no plot is shown!')
 | Feature | Log Odds (Logit) | Odds Ratio
---|---|---|---
9 | tokens__bad | -0.663000 | 0.515303 |
73 | tokens__script | -0.488679 | 0.613436 |
11 | tokens__better | -0.451887 | 0.636426 |
36 | tokens__great | 0.420907 | 1.523343 |
58 | tokens__nothing | -0.394111 | 0.674279 |
65 | tokens__plot | -0.361455 | 0.696662 |
4 | tokens__although | 0.325478 | 1.384693 |
93 | tokens__well | 0.290438 | 1.337013 |
62 | tokens__performance | 0.282515 | 1.326462 |
76 | tokens__seen | 0.281643 | 1.325305 |
10 | tokens__best | 0.275137 | 1.316711 |
1 | tokens__actually | -0.259407 | 0.771509 |
96 | tokens__world | 0.257455 | 1.293633 |
51 | tokens__many | 0.239504 | 1.270619 |
47 | tokens__made | -0.233977 | 0.791380 |
68 | tokens__right | 0.226679 | 1.254427 |
19 | tokens__director | -0.214460 | 0.806977 |
94 | tokens__without | 0.210763 | 1.234619 |
3 | tokens__also | 0.206651 | 1.229554 |
23 | tokens__ever | 0.189905 | 1.209135 |
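Note that the Odds Ratio column is just the exponential of the Log Odds column, e.g. exp(-0.663) ≈ 0.515 for tokens__bad. Assuming the coefficients are reported for the second class (pos here), which is how scikit-learn stores them for binary logistic regression, an odds ratio below 1 means each additional occurrence of the token shifts the prediction towards neg, and a value above 1 shifts it towards pos.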
4.3 Investigate correct and incorrect predictions
To see the predictions of your model run this cell. The output can be quite long depending on the dataset and the number of misclassifications. The Pandas max_rows is configured at the top of the cell to restrict the length of output. You can adjust this as required. This is reset back to the Pandas default at the end of the cell.
# adjust max rows
pd.set_option('display.max_rows', 5) # restrict how many rows are shown for each preview below

# creating dataframe from y_predicted, y_test and the text
predictions_df = pd.DataFrame(data = {'true': y_test, 'predicted': y_predicted})
y_predicted_probs = pipeline.predict_proba(X_test)
y_predicted_probs = np.round(y_predicted_probs, 3)
columns = [f'{target_names[i]}_prob' for i in range(len(target_names))]
predictions_df['predicted'] = predictions_df['predicted'].apply(lambda x: label_names[x])
predictions_df['true'] = predictions_df['true'].apply(lambda x: label_names[x])
predictions_df['correct'] = predictions_df['true'] == predictions_df['predicted']
predictions_df['text'] = X_test
predictions_df = pd.concat([predictions_df, pd.DataFrame(y_predicted_probs, columns=columns)], axis=1)

# output a preview of docs for each cell of confusion matrix ...
for true_target, target_name in enumerate(target_names):
    for predicted_target, target_name in enumerate(target_names):
        if true_target == predicted_target:
            print(f'\nCORRECTLY CLASSIFIED: {target_names[true_target]}')
        else:
            print(f'\n{target_names[true_target]} INCORRECTLY CLASSIFIED as: {target_names[predicted_target]}')
        print('=================================================================')
        display(predictions_df[(predictions_df['true'] == target_names[true_target]) & (predictions_df['predicted'] == target_names[predicted_target])])

pd.set_option('display.max_rows', 60) # setting back to the default
CORRECTLY CLASSIFIED: neg
=================================================================
 | true | predicted | correct | text | neg_prob | pos_prob
---|---|---|---|---|---|---
0 | neg | neg | True | dr dolittle ( 20th century fox ) running time : 1 hour 25 minutes starring eddie murphy directed by betty thomas riding high on the success of the nutty professor ( 1996 ) , murphy returns in this abysmal comedy . \nhe plays doctor john dolittle , who as a child had the ability to understand animals . \nhowever , after being 'exorcised' he loses this ability , and we fast forward to see dolittle in a crummy job surrounding by crummy people ( most notably dr mark weller , played by oliver pla... | 0.883 | 0.117 |
1 | neg | neg | True | " spawn " features good guys , bad guys , lots of fighting , bloody violence , a leather-clad machine gun chick , gooey , self-healing bullet holes , scatological humor and a man-eating monster . \nit not only appears to have been tailor made for a swarm of 12- and 13-year-old boys , it appears to have been made by them . \nin a classic example of telling and not showing , " spawn " opens with a truckload of mumbo jumbo about forces of darkness , forces of light and how " men are the ones w... | 0.806 | 0.194 |
... | ... | ... | ... | ... | ... | ... |
396 | neg | neg | True | it's difficult to expect much from a director whose greatest accomplishments to date are a handful of " award-winning " tv commercials , as is the case with bubble boy director blair hayes . \nthat said , hayes's feature film debut lives up to expectations , coming off mainly as equal parts offensive and moronic . \nbut occasionally , bubble boy transcends its substandard roots with glimmers of humor and scathing social commentary . \nthose moments of intelligence are delivered mostly by the... | 0.515 | 0.485 |
397 | neg | neg | True | whether you like the beatles or not , nobody wants to see the bee gee's take on some of the fab four's best known songs . \nwell , maybe that's not true . \n . \n . \nmaybe you're curious , the way you have to look in your hanky after you blow your nose . \nyou just have to know how bad bad can be . \nif that's the case , rejoice , because it was twenty years ago today ( or so ) that sgt . \npepper's lonely hearts club band was released ( unleashed ? ) to the world , and thanks to our modern... | 0.727 | 0.273 |
138 rows × 6 columns
neg INCORRECTLY CLASSIFIED as: pos
=================================================================
 | true | predicted | correct | text | neg_prob | pos_prob
---|---|---|---|---|---|---
10 | neg | pos | False | in the line of duty is the critically praised series of television movies dealing with the real-life incidents that claimed lives of law enforcement officers in usa . \nthe twilight murders , another one from the series , is dealing with the case of gordon kahl ( played by rod steiger ) , old farmer from north dakota who would rather spend a year in prison than pay taxes to the despised u . s . government . \nafter being released , he still refuses to pay taxes and the warrant is issued for ... | 0.152 | 0.848 |
14 | neg | pos | False | according to popular film opinion , a film's greatness is determined by time . \ntake for example " casablanca . " \ngreat film , even today . \nit's still as powerful as it was when it came out and still as romantic and tragic . \nanother example would be " star wars , " which had a very , very healthy box office gross despite the fact that we had all seen it about 3 billion times before . \nbut as i rewatched " independence day " when it came out on video after being the number one hit of ... | 0.189 | 0.811 |
... | ... | ... | ... | ... | ... | ... |
388 | neg | pos | False | spoiled rich kid kelley morse ( chris klein ) receives a new mercedes for a graduation present . \nhe and his buddies take it for a joyride to a small nearby town , where he proceeds to torment the locals simply because he's rich and they're not . \nhe ends up provoking jasper ( josh hartnett ) into a race and as a result , the local gas station and diner are destroyed when they crash into it . \nkelley is sentenced to rebuild the diner , and has to live with jasper in a spare room over his ... | 0.207 | 0.793 |
398 | neg | pos | False | you think that these people only exist in the movies , but trust me , they're as real as life . \ni once talked to a guy who thought the united states government was putting satellites into orbit which could fry an individual person's brain with microwaves . \nthen i sat in a room full of people who believed that the government rigged state elections . \ni even listened to a man who swore that nicotine was an additive that cigarette companies put in their products for the specific goal of ge... | 0.492 | 0.508 |
62 rows × 6 columns
pos INCORRECTLY CLASSIFIED as: neg
=================================================================
 | true | predicted | correct | text | neg_prob | pos_prob
---|---|---|---|---|---|---
13 | pos | neg | False | some of my friends who went to live in usa complain about one thing - that country is very different from the one depicted in hollywood movies . \nthat is especially true for those who end up somewhere in that unexplored land between los angeles and new york where they find , to their big surprise , that the majority of people vote republican , go to church every sunday and usually don't tolerate liberal attitudes that are taken for granted in an average american film . \nsuch rude awakening... | 0.896 | 0.104 |
22 | pos | neg | False | " the blair witch project " was perhaps one of a kind , a unique film that played completely on its own merit , managing to scare even the most experienced horror fans out of their senses . \nits success made a sequel inevitable , but this is not the sequel , i suspect , anyone much wanted . \nafter the release of " the blair witch project " , tourists have practically invaded the small town of burkettsville , in order to get a glimpse of the blair witch . \nlocals have turned this mass hys... | 0.661 | 0.339 |
... | ... | ... | ... | ... | ... | ... |
393 | pos | neg | False | in the wake of the smashing success of " rumble in the bronx , " it's looking more and more likely that more jackie chan films will see american release . \nrumor has it that one of these films will be drunken master ii . \nthe version i have is a copy from the laserdisc ; it's widescreen and bilingually-subtitled , as are most hong kong films these days . \navailability over here in the united states is very limited ; these films must either be purchased via pirates or sought out from asian... | 0.862 | 0.138 |
394 | pos | neg | False | there exists a litany of differences between a successful action movie and a successful suspense movie . \naction movies are typically devoid of plot other than a simple byline which can string together several explosive sequences , while suspense movies hinge on plot and subtlety and the ability to bring everything full-circle . \nfor fans of both genres , however , realism is key . \naction fans want to know that the weapons and methods their heroes are using are authentic , and suspense f... | 0.863 | 0.137 |
57 rows × 6 columns
CORRECTLY CLASSIFIED: pos
=================================================================
 | true | predicted | correct | text | neg_prob | pos_prob
---|---|---|---|---|---|---
2 | pos | pos | True | the keen wisdom of an elderly bank robber , the naive ambitions of a sexy hospital nurse , and a partnership that blossoms between the two are the fine components that make up a modest , little caper adventure entitled `where the money is . ' \nthe elderly bank robber is henry ( paul newman ) , a famous criminal that was only recently caught . \nhe has pulled off dozens of successful heists and has probably stashed away a small fortune . \nalways the shrewd thinker , he begins working on a p... | 0.126 | 0.874 |
5 | pos | pos | True | one of the sweetest tales to ever be made , it's a wonderful life isn't perfect , but its good natured charm and beautiful performances light up the screen with glorious results . \nprobably the greatest " feel-good " film of all time , it's a wonderful life aims for the heart , and strikes with a golden arrow . \non christmas eve , george bailey ( stewart ) is being prayed for by many in the small town of bedford falls . \nyou see , george is in trouble , and he has always helped others who... | 0.021 | 0.979 |
... | ... | ... | ... | ... | ... | ... |
395 | pos | pos | True | if you had a chance to create a genetically perfect child , would you do it ? \n " gattaca " is a film which presents a future where society has answered " yes " to this question , but then ponders if this was actually the right decision . \n " gattaca " came out only a couple months following the first genetically engineered creature , the lovable dolly the sheep , and with this in mind , the film only becomes more frightening . \nthe way the realm of genetics is heading , it may only take ... | 0.024 | 0.976 |
399 | pos | pos | True | the postman delivers but not first class \nthe postman a film review by michael redman copyright 1997 by michael redman \n[warning : my opinion of this film is definitely in the minority of reviewers perhaps because it hits so many of my cinematic buttons : post-apocalypse stories , hope in a desperate situation , grassroots uprisings and kevin costner . \neven worse , i thought that " waterworld " was watchable . \nread the following with those particular grains of salt . ] \nit's the year ... | 0.266 | 0.734 |
143 rows × 6 columns
4.4 Run inference on new (or old) data
You can also run inference on new data (or any of the texts from training/validation) by changing the contents of the texts
list below. This outputs a prediction, the probabilities of each class and the features present within the text that are used by the model to make its predictions. The numbers for each feature are the input to the final step of the pipeline. They may be scaled or transformed depending on the pipeline components you’ve chosen.
texts = ['''
It was excellent!
''',
'''
This was a terrible movie!
''',
'''
This might not not be the best movie ever made, or it could be the best movie of no time.
''',
]

y_inference = pipeline.predict(texts)

preprocessor = Pipeline(pipeline.steps[:-1])
feature_names = preprocessor.named_steps['features'].get_feature_names_out()

for i, text in enumerate(texts):
    print(f"Text {i}: {text}")
    print(f"\tPredicted class: {label_names[y_inference[i]]}")
    print()
    y_inference_proba = pipeline.predict_proba([text])
    for i, prob in enumerate(y_inference_proba[0]):
        print(f"\tProbability of class {target_names[i]}: {prob:.2f}")
    print()
    print("\tFeatures:")
    embeddings = 0
    frequencies = preprocessor.transform([text])
    if not isinstance(frequencies, np.ndarray):
        frequencies = frequencies.toarray()
    frequencies = frequencies[0].T
    for j, freq in enumerate(frequencies):
        if feature_names[j].startswith('embeddings_'):
            embeddings += 1
        elif freq > 0:
            print(f"\t{feature_names[j]}: {freq:.2f}")
    if embeddings > 0:
        print(f"\tFeatures also include {embeddings} embedding dimensions")
    print()
Text 0:
It was excellent!
Predicted class: neg
Probability of class neg: 0.56
Probability of class pos: 0.44
Features:
Text 1:
This was a terrible movie!
Predicted class: neg
Probability of class neg: 0.58
Probability of class pos: 0.42
Features:
tokens__movie: 1.00
Text 2:
This might not not be the best movie ever made, or it could be the best movie of no time.
Predicted class: pos
Probability of class neg: 0.47
Probability of class pos: 0.53
Features:
tokens__best: 2.00
tokens__ever: 1.00
tokens__made: 1.00
tokens__movie: 2.00
tokens__time: 1.00