Source code for text_classification.classifier.class_average

import logging
from collections import defaultdict
import csv

from sklearn.metrics import classification_report

from text_classification.classifier.base import BaseClassifier

logger = logging.getLogger(__name__)


class ClassAverageClassifier(BaseClassifier):
    """
    A classifier that computes the average feature values for each class and
    predicts the class whose average feature vector is most similar to the
    instance to classify.
    """

    def __init__(self):
        self.feature_names = []
        self.labels = []
        self._average_feature_values = dict()

    def train(self, preprocessor):
        """
        Computes the average feature vector for each class in the
        preprocessor's train set.

        :param preprocessor: Preprocessor instance that contains a train set
            and has already been featurized, i.e. each train instance should
            contain the keys "feature_vector", "feature_names" and "label".
        :type preprocessor: BasePreprocessor
        :return: ClassAverageClassifier
        """
        train_set = preprocessor.get_train_data()
        if not train_set:
            logger.warning("Classifier won't be trained as Preprocessor's "
                           "train set is empty.")
            return self
        logger.info(f"Training the classifier on {len(train_set)} training "
                    f"instances...")

        # split the train set into its different classes
        split_by_label = defaultdict(list)
        for instance in train_set:
            split_by_label[instance["label"]].append(instance)
        self.labels = list(split_by_label.keys())

        # calculate the average feature vector for each class
        for label in split_by_label:
            # group the values of the same feature together
            grouped_feature_values = zip(
                *[instance["feature_vector"]
                  for instance in split_by_label[label]]
            )
            self._average_feature_values[label] = [
                sum(feature) / len(split_by_label[label])
                for feature in grouped_feature_values
            ]
        self.feature_names = train_set[0]["feature_names"]
        logger.info("Training done.")

        # evaluate on the dev set
        if preprocessor.get_dev_data():
            self.evaluate(preprocessor, evaluate_test=False,
                          evaluate_dev=True)
        return self

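    # A worked sketch of the averaging above, with hypothetical numbers: two
    # "pos" instances with feature vectors [1.0, 4.0] and [3.0, 2.0] are
    # zipped into the per-feature groups (1.0, 3.0) and (4.0, 2.0), so the
    # stored average vector for "pos" becomes [2.0, 3.0].
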
    def evaluate(self, preprocessor, evaluate_test=True, evaluate_dev=False):
        """
        Evaluates the current model on the preprocessor's test and/or dev set
        and prints a classification report containing accuracy, precision,
        recall and F1-scores.

        :param preprocessor: Preprocessor containing dev/test samples.
        :type preprocessor: BasePreprocessor
        :param evaluate_test: Whether to evaluate on the test set.
        :type evaluate_test: bool
        :param evaluate_dev: Whether to evaluate on the dev set.
        :type evaluate_dev: bool
        """
        # make predictions
        self.predict(preprocessor, predict_train=False,
                     predict_test=evaluate_test, predict_dev=evaluate_dev)

        # calculate evaluation scores
        if evaluate_test:
            predictions = []
            gold_labels = []
            for instance in preprocessor.get_test_data():
                predictions.append(instance["prediction"])
                gold_labels.append(instance["label"])
            assert len(gold_labels) == len(predictions), \
                "Label list and prediction list are not of the same length."
            assert len(gold_labels) > 0, \
                "Evaluation on an empty test set is not possible."
            logger.info(f"\n___Evaluation metrics on test set___\n"
                        f"{classification_report(gold_labels, predictions)}")
        if evaluate_dev:
            predictions = []
            gold_labels = []
            for instance in preprocessor.get_dev_data():
                predictions.append(instance["prediction"])
                gold_labels.append(instance["label"])
            assert len(gold_labels) == len(predictions), \
                "Label list and prediction list are not of the same length."
            assert len(gold_labels) > 0, \
                "Evaluation on an empty dev set is not possible."
            logger.info(f"\n___Evaluation metrics on dev set___\n"
                        f"{classification_report(gold_labels, predictions)}")

    def predict(self, preprocessor, predict_train=False, predict_test=True,
                predict_dev=False):
        """
        Makes predictions for the samples inside the preprocessor in-place,
        i.e. for each instance, a key 'prediction' containing the prediction
        is added. Instances have to be featurized beforehand using the same
        Featurizer that was used for the training instances.

        :param preprocessor: Preprocessor containing the samples to make
            predictions on.
        :type preprocessor: BasePreprocessor
        :param predict_train: Whether to make predictions on the train set.
        :type predict_train: bool
        :param predict_test: Whether to make predictions on the test set.
        :type predict_test: bool
        :param predict_dev: Whether to make predictions on the dev set.
        :type predict_dev: bool
        """
        if predict_train:
            train_set = preprocessor.get_train_data()
            logger.info(f"Making predictions on {len(train_set)} instances "
                        f"in the train set.")
            self.predict_from_dicts(train_set)
        if predict_test:
            test_set = preprocessor.get_test_data()
            logger.info(f"Making predictions on {len(test_set)} instances "
                        f"in the test set.")
            self.predict_from_dicts(test_set)
        if predict_dev:
            dev_set = preprocessor.get_dev_data()
            logger.info(f"Making predictions on {len(dev_set)} instances "
                        f"in the dev set.")
            self.predict_from_dicts(dev_set)

    def predict_from_dicts(self, dicts):
        """
        Makes predictions on a list of dictionaries. Each dictionary must
        contain the key 'feature_vector' holding the instance's feature
        vector.

        :param dicts: List of dicts, where each dict represents an instance.
        :type dicts: List[dict]
        :return: Updated list of dictionaries.
        """
        for instance in dicts:
            if "feature_vector" in instance:
                assert (len(instance["feature_vector"])
                        == len(self.feature_names)
                        and instance["feature_names"] == self.feature_names), \
                    ("Feature vectors of the instances to predict and of "
                     "the classifier don't match. Make sure to use the same "
                     "Featurizer!")
                most_similar_label = self._get_most_similar_label(instance)
                instance["prediction"] = most_similar_label
            else:
                raise KeyError("Instance to predict doesn't contain a "
                               "feature vector. Make sure to apply a "
                               "Featurizer first!")
        return dicts

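    # Shape of an instance dict expected above (the feature names are
    # hypothetical); after predict_from_dicts() the key "prediction" holds
    # the predicted label:
    #
    #     {"feature_vector": [11.0, 3.0],
    #      "feature_names": ["num_tokens", "num_exclamations"],
    #      "prediction": "pos"}
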
    def save_average_feature_vectors(self, filename, delimiter="\t",
                                     label_col="label"):
        """
        Saves the trained average vectors to a CSV file.

        :param filename: File where the average vectors should be saved.
        :type filename: str
        :param delimiter: Delimiter used in the CSV file.
        :type delimiter: str
        :param label_col: Name of the label column.
        :type label_col: str
        """
        logger.info(f"Saving average feature vectors to {filename}...")
        # newline="" is recommended for files handled by the csv module
        with open(filename, "w", newline="") as file:
            csv_writer = csv.writer(file, delimiter=delimiter)
            csv_writer.writerow([label_col] + self.feature_names)
            for label in self._average_feature_values:
                csv_writer.writerow(
                    [label] + self._average_feature_values[label])

    @classmethod
    def load_average_feature_vectors(cls, filename, delimiter="\t",
                                     label_col="label"):
        """
        Loads trained average vectors from a CSV file and instantiates a
        ClassAverageClassifier instance.

        :param filename: File where the average vectors are saved.
        :type filename: str
        :param delimiter: Delimiter used in the CSV file.
        :type delimiter: str
        :param label_col: Name of the label column.
        :type label_col: str
        :return: ClassAverageClassifier instance.
        """
        logger.info(f"Loading average feature vectors from {filename}...")
        classifier = cls()
        with open(filename, "r", newline="") as file:
            csv_reader = csv.reader(file, delimiter=delimiter)
            headers = next(csv_reader)
            label_col_idx = headers.index(label_col)
            headers.pop(label_col_idx)
            classifier.feature_names = headers
            for row in csv_reader:
                label = row[label_col_idx]
                if label not in classifier.labels:
                    classifier.labels.append(label)
                row.pop(label_col_idx)
                classifier._average_feature_values[label] = [
                    float(val) for val in row]
        return classifier

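    # Sketch of the tab-separated layout written by
    # save_average_feature_vectors() and read back by
    # load_average_feature_vectors(), with hypothetical feature names and
    # values:
    #
    #     label    num_tokens    num_exclamations
    #     pos      12.0          2.5
    #     neg      15.0          0.5
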
    def _get_most_similar_label(self, instance):
        # Computes the L1 distance between the instance and the average
        # feature vector of each class: the smaller the value, the more
        # similar the instance and the average feature vector are.
        similarity_values = defaultdict(float)
        instance_vector = instance["feature_vector"]
        for label in self._average_feature_values:
            average_vector = self._average_feature_values[label]
            similarity = sum(abs(instance_val - average_val)
                             for instance_val, average_val
                             in zip(instance_vector, average_vector))
            similarity_values[label] = similarity
        most_similar_label = min(similarity_values,
                                 key=similarity_values.get)
        return most_similar_label
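

if __name__ == "__main__":
    # Minimal usage sketch. The averages below are set by hand and are
    # hypothetical; in normal use they are computed by train() from a
    # featurized BasePreprocessor.
    classifier = ClassAverageClassifier()
    classifier.feature_names = ["num_tokens", "num_exclamations"]
    classifier.labels = ["pos", "neg"]
    classifier._average_feature_values = {"pos": [12.0, 2.5],
                                          "neg": [15.0, 0.5]}

    instance = {"feature_vector": [11.0, 3.0],
                "feature_names": ["num_tokens", "num_exclamations"]}
    classifier.predict_from_dicts([instance])

    # L1 distance to "pos" is |11-12| + |3-2.5| = 1.5, to "neg" it is
    # |11-15| + |3-0.5| = 6.5, so "pos" is predicted.
    print(instance["prediction"])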