Source code for text_classification.featurizer.tweet_featurizer

from multiprocessing import cpu_count
from collections import Counter
from statistics import mean
import logging
import itertools

import spacy
from spacymoji import Emoji

from text_classification.featurizer.base import BaseFeaturizer

logger = logging.getLogger(__name__)


[docs]class TweetFeaturizer(BaseFeaturizer):
    """
    Featurizer that extracts features from tweets, i.e. it doesn't
    contain any paragraph-based features as these don't apply for
    tweets.
    """

[docs]    def __init__(self, lang_model="en_core_web_sm", normalize=True):
        """
        Instantiates a TweetFeaturizer instance.

        :param lang_model: A spaCy language model name.
        :type lang_model: str
        :param normalize: Whether to normalize the features based on
            number of chars/tokens.
        :type normalize: bool
        """

        try:
            # initialize spacy model
            self.spacy_model = spacy.load(lang_model, disable=["parser"])
            emoji_detector = Emoji(self.spacy_model)
            self.spacy_model.add_pipe(emoji_detector, first=True)
        except OSError:
            # user inserted an unknown model name
            raise ModuleNotFoundError(
                f"Language model '{lang_model}' not installed.\n"
                f"\tTo install the model, execute: 'python -m spacy download "
                f"{lang_model}'\n"
                f"\tAvailable models can be found at: "
                f"https://spacy.io/usage/models"
            )

        self.normalize = normalize
        self.feature_functions = [
            self._char_based_features,
            self._word_based_features,
            self._pos_features,
            self._ner_features,
        ]

[docs]    def add_feature(self, feature_extraction_function):
        """
        Adds a custom feature extraction function to the predefined
        ones. The feature extraction function must take as input a
        dictionary containing a key 'text' and return a dict with
        with 'feature_names' and 'feature_vector' as keys.

        :param feature_extraction_function: Custom function that
            extracts features from text.
        :type feature_extraction_function: function
        """
        self.feature_functions.append(feature_extraction_function)

[docs]    def extract_features(self, preprocessor, exclude=set()):
        """
        Extracts the features for all splits in the preprocessor and
        adds feature vector and feature name for each instance in-place.

        :param preprocessor: Preprocessor containing samples to featurize.
        :type preprocessor: BasePreprocessor
        :param exclude: Set of features that should be excluded from
            resulting feature vectors.
        :param exclude: Set[str]
        """
        logger.info("Extracting features...")

        data_splits = preprocessor.get_data()

        for split in data_splits:
            # get annotations from spacy
            self._add_spacy_annotations(split)

            # extract features for each instance
            for instance in split:
                instance["feature_vector"] = []
                instance["feature_names"] = []
                for function in self.feature_functions:
                    count_dict = function(instance, exclude)
                    if count_dict is not None:
                        instance["feature_vector"] += list(count_dict.values())
                        instance["feature_names"] += list(count_dict.keys())

        logger.info(f"Extracted features for "
                    f"{len(list(itertools.chain(*data_splits)))} instances.")

    def _char_based_features(self, instance, exclude=set()):
        counts = Counter({
            "alpha": 0,
            "upper": 0,
            "lower": 0,
            "numeric": 0,
            "whitespace": 0,
            "comma": 0,
            "dot": 0,
            "exclamation": 0,
            "question": 0,
            "colon": 0,
            "semicolon": 0,
            "hyphen": 0,
            "at": 0,
        })

        for char in instance["text"]:
            if char.isalpha() and "alpha" not in exclude:
                counts["alpha"] += 1
                if char.isupper() and "upper" not in exclude:
                    counts["upper"] += 1
                elif char.islower() and "lower" not in exclude:
                    counts["lower"] += 1

            elif char.isnumeric() and "numeric" not in exclude:
                counts["numeric"] += 1
            elif char.isspace() and "whitespace" not in exclude:
                counts["whitespace"] += 1
            elif char == "," and "comma" not in exclude:
                counts["comma"] += 1
            elif char == "." and "dot" not in exclude:
                counts["dot"] += 1
            elif char == "!" and "exclamation" not in exclude:
                counts["exclamation"] += 1
            elif char == "?" and "question" not in exclude:
                counts["question"] += 1
            elif char == ":" and "colon" not in exclude:
                counts["colon"] += 1
            elif char == ";" and "semicolon" not in exclude:
                counts["semicolon"] += 1
            elif char == "-" and "hyphen" not in exclude:
                counts["hyphen"] += 1
            elif char == "@" and "at" not in exclude:
                counts["at"] += 1

        # normalize counts
        if self.normalize:
            number_of_chars = len(instance["text"])
            for feature, count in counts.items():
                counts[feature] = count / number_of_chars

        return counts

    def _word_based_features(self, instance, exclude=set()):
        counts = Counter()

        counts["stop_words"] = sum(instance["is_stop"])
        counts["emojis"] = sum(instance["is_emoji"])

        token_counts = len(instance["tokens"])

        if self.normalize:
            for feature, count in counts.items():
                counts[feature] = counts[feature] / token_counts

        counts["token_counts"] = token_counts
        counts["avg_token_len"] = mean(len(token)
                                       for token in instance["tokens"])

        for key in exclude:
            counts.pop(key, default=None)

        return counts

    def _pos_features(self, instance, exclude=set()):
        if "pos" in exclude:
            return

        counts = Counter({key: 0 for key in BaseFeaturizer.COARSE_POS_TAGS})
        for pos_tag in instance["pos_tags"]:
            counts[pos_tag] += 1

        if self.normalize:
            token_counts = len(instance["tokens"])
            for feature, count in counts.items():
                counts[feature] = counts[feature] / token_counts

        return counts

    def _ner_features(self, instance, exclude=set()):
        if "ner" in exclude:
            return

        counts = Counter({key: 0 for key in
                          self.spacy_model.pipe_labels["ner"]})
        for named_entity in instance["named_entities"]:
            counts[named_entity] += 1

        if self.normalize:
            token_counts = len(instance["tokens"])
            for feature, count in counts.items():
                counts[feature] = counts[feature] / token_counts

        return counts

    def _add_spacy_annotations(self, data):
        spacy_docs = self.spacy_model.pipe([sample["text"] for sample in data],
                                           n_process=cpu_count())

        for spacy_doc, sample in zip(spacy_docs, data):
            tokens = []
            lemmas = []
            pos_tags = []
            is_stop = []
            is_emoji = []
            for token in spacy_doc:
                tokens.append(token.text)
                lemmas.append(token.lemma_)
                pos_tags.append(token.pos_)
                is_stop.append(token.is_stop)
                is_emoji.append(token._.is_emoji)

            named_entities = [span.label_ for span in spacy_doc.ents]
            sample.update(
                tokens=tokens,
                lemmas=lemmas,
                pos_tags=pos_tags,
                named_entities=named_entities,
                is_stop=is_stop,
                is_emoji=is_emoji,
            )