# Source code for text_classification.preprocessor.csv_preprocessor

import csv
import random
import logging

from text_classification.preprocessor.base import BasePreprocessor

logger = logging.getLogger(__name__)


class CSVPreprocessor(BasePreprocessor):
    """
    Preprocessor that is able to read a csv-file and do train/test/dev split.

    A preprocessor instance serves as a samples storage whose instances can
    be extended with feature vectors and predictions. Every sample is a dict
    with a ``"text"`` entry and, when the label column exists in the file,
    a ``"label"`` entry.
    """

    def __init__(self, train_filename=None, test_filename=None,
                 dev_filename=None, test_split=0, dev_split=0,
                 delimiter="\t", text_column="text", label_column="label",
                 random_state=None):
        """
        :param train_filename: Train set file.
        :type train_filename: str
        :param test_filename: Test set file.
        :type test_filename: str
        :param dev_filename: Dev set file.
        :type dev_filename: str
        :param test_split: Fraction of train set that should be used as
            test set.
        :type test_split: float
        :param dev_split: Fraction of train set that should be used as
            dev set.
        :type dev_split: float
        :param delimiter: Delimiter that is used in csv-file
        :type delimiter: str
        :param text_column: Column in csv-file containing text.
        :type text_column: str
        :param label_column: Column in csv-file containing label.
        :type label_column: str
        :param random_state: Random state for shuffling samples.
        :type random_state: int
        :raises ValueError: If a train file is given and ``test_split`` or
            ``dev_split`` is outside [0, 1] or their sum exceeds 1.
        """
        # Seeds the module-level generator so the shuffle below is
        # reproducible for a given random_state.
        random.seed(random_state)

        self.train = []
        self.test = []
        self.dev = []

        if train_filename:
            # Fail fast: reject invalid fractions before touching the file.
            self._validate_splits(test_split, dev_split)

            logger.info(f"Reading {train_filename}...")
            data = self._extract_data(train_filename, delimiter,
                                      text_column, label_column)

            # shuffle samples if we want to use part of it as test or
            # dev set
            if test_split or dev_split:
                random.shuffle(data)

            # calculate number of dev and test samples
            number_of_test_samples = int(len(data) * test_split)
            number_of_dev_samples = int(len(data) * dev_split)
            split_end = number_of_test_samples + number_of_dev_samples

            # split samples into train, test and dev sets
            self.test = data[:number_of_test_samples]
            self.dev = data[number_of_test_samples:split_end]
            self.train = data[split_end:]

        # add external test and dev samples (appended to any split-off part)
        if test_filename:
            logger.info(f"Reading {test_filename}...")
            self.test += self._extract_data(test_filename, delimiter,
                                            text_column, label_column)
        if dev_filename:
            logger.info(f"Reading {dev_filename}...")
            self.dev += self._extract_data(dev_filename, delimiter,
                                           text_column, label_column)

    @staticmethod
    def _validate_splits(test_split, dev_split):
        """
        Check that both split fractions and their sum lie within [0, 1].

        :raises ValueError: If one of the constraints is violated.
        """
        if not (0 <= test_split <= 1):
            raise ValueError(
                f"test_split should be between 0 and 1. "
                f"test_split is: {test_split}"
            )
        if not (0 <= dev_split <= 1):
            raise ValueError(
                f"dev_split should be between 0 and 1. "
                f"dev_split is: {dev_split}"
            )
        if dev_split + test_split > 1:
            raise ValueError(
                f"Sum of test_split and dev_split must not be greater "
                f"than 1. Sum is: {dev_split + test_split}"
            )

    def get_data(self):
        """
        Returns a tuple containing train, test and dev set.

        :return: Tuple with train, test and dev set.
        """
        return self.train, self.test, self.dev

    def get_train_data(self):
        """
        Returns train set.

        :return: Train set.
        """
        return self.train

    def get_test_data(self):
        """
        Returns test set.

        :return: Test set.
        :rtype: List[dict]
        """
        return self.test

    def get_dev_data(self):
        """
        Returns dev set.

        :return: Dev set.
        """
        return self.dev

    @classmethod
    def from_file(cls, train_filename=None, test_filename=None,
                  dev_filename=None, test_split=0, dev_split=0,
                  delimiter="\t", text_column="text", label_column="label",
                  random_state=None):
        """
        Load samples from csv-files.

        :param train_filename: Train set file.
        :type train_filename: str
        :param test_filename: Test set file.
        :type test_filename: str
        :param dev_filename: Dev set file.
        :type dev_filename: str
        :param test_split: Fraction of train set that should be used as
            test set.
        :type test_split: float
        :param dev_split: Fraction of train set that should be used as
            dev set.
        :type dev_split: float
        :param delimiter: Delimiter that is used in csv-file.
        :type delimiter: str
        :param text_column: Column in csv-file containing text.
        :type text_column: str
        :param label_column: Column in csv-file containing label.
        :type label_column: str
        :param random_state: Random state for shuffling samples.
        :type random_state: int
        :return: CSVPreprocessor instance
        """
        return cls(train_filename, test_filename, dev_filename, test_split,
                   dev_split, delimiter, text_column, label_column,
                   random_state)

    def _select_samples(self, set_name):
        """
        Return the samples list belonging to ``set_name``.

        :param set_name: One of "train", "test", "dev".
        :raises ValueError: If ``set_name`` is none of the allowed values.
        """
        # Go through the public getters so that overrides are honoured.
        getters = {
            "test": self.get_test_data,
            "dev": self.get_dev_data,
            "train": self.get_train_data,
        }
        try:
            getter = getters[set_name]
        except (KeyError, TypeError):
            # TypeError: unhashable value passed as set name
            raise ValueError(
                f"Arg set has to be one of the following values:"
                f" 'test', 'train', 'dev'. Arg set is: {set_name}"
            ) from None
        return getter()

    # NOTE: the parameter name ``set`` shadows the builtin; it is kept
    # because callers may pass it by keyword.
    def write_csv(self, filename, delimiter="\t", set="test"):
        """
        Write samples (i.e. text, label, prediction) to a csv-file.

        :param filename: File to write the samples to.
        :type filename: str
        :param delimiter: Delimiter that is used in csv-file.
        :type delimiter: str
        :param set: Which samples set to write. Possible values:
            "train", "test", "dev"
        :type set: str
        :raises ValueError: If ``set`` is not one of the possible values.
        """
        samples = self._select_samples(set)
        logger.info(f"Writing {set} samples to {filename}...")
        self._write_csv(filename, delimiter, samples)

    def _write_csv(self, filename, delimiter, samples):
        """
        Write a header row and one text/label/prediction row per sample.

        Missing entries of a sample are written as empty strings.
        """
        # newline="" is what the csv module expects for files it writes to
        with open(filename, "w", newline="") as file:
            csv_writer = csv.writer(file, delimiter=delimiter)
            csv_writer.writerow(["text", "label", "prediction"])
            for instance in samples:
                csv_writer.writerow([instance.get("text", ""),
                                     instance.get("label", ""),
                                     instance.get("prediction", "")])

    def write_feature_vectors(self, filename, delimiter="\t", set="train"):
        """
        Write extracted features to a csv-file.

        :param filename: File to write the feature vectors to.
        :type filename: str
        :param delimiter: Delimiter that is used in csv-file.
        :type delimiter: str
        :param set: From which samples set to write the feature vectors.
            Possible values: "train", "test", "dev"
        :type set: str
        :raises ValueError: If ``set`` is not one of the possible values.
        """
        samples = self._select_samples(set)
        logger.info(f"Writing {set} feature vectors to {filename}...")
        # Previously this delegated to _write_csv, so no features were
        # ever written.
        self._write_feature_vectors(filename, delimiter, samples)

    def _write_feature_vectors(self, filename, delimiter, samples):
        """
        Write text, label, prediction and feature columns per sample.

        The feature column names are taken from the ``"feature_names"``
        entry of the first sample. If the set is empty or the first sample
        has no such entry, a warning is logged and the file is left empty.
        """
        with open(filename, "w", newline="") as file:
            if not samples:
                logger.warning("Cannot write feature vectors for empty set.")
                return
            try:
                feat_names = list(samples[0]["feature_names"])
            except KeyError:
                logger.warning("No feature vectors available. Please extract "
                               "features before wanting to write feature "
                               "vectors.")
                return

            csv_writer = csv.writer(file, delimiter=delimiter)
            csv_writer.writerow(["text", "label", "prediction"] + feat_names)

            # Samples without features get one zero per feature column.
            zeros = [0] * len(feat_names)
            for instance in samples:
                # NOTE(review): the per-row cells are read from
                # "feature_names", which looks like it should be the
                # feature values instead -- confirm the key used by the
                # feature extractor.
                features = list(instance.get("feature_names", zeros))
                csv_writer.writerow([instance.get("text", ""),
                                     instance.get("label", ""),
                                     instance.get("prediction", "")]
                                    + features)

    @staticmethod
    def _extract_data(filename, delimiter, text_column, label_column):
        """
        Read samples from a csv-file with a header row.

        :param filename: File to read.
        :param delimiter: Delimiter that is used in csv-file.
        :param text_column: Name of the column containing text.
        :param label_column: Name of the column containing label. If the
            column does not exist, samples are read without label.
        :return: List of dicts with key "text" and, if available, "label".
        :raises EOFError: If the file is empty.
        :raises ValueError: If ``text_column`` is not a column of the file.
        """
        # newline="" lets the csv module handle line endings (including
        # newlines embedded in quoted fields) itself.
        with open(filename, "r", newline="") as file:
            csv_reader = csv.reader(file, delimiter=delimiter)

            try:
                headers = next(csv_reader)
            except StopIteration:
                raise EOFError(f"'{filename}' is empty. Please provide a "
                               f"non-empty file or set filename to 'None' if "
                               f"you want to use an empty CSVPreprocessor."
                               ) from None

            try:
                text_col_idx = headers.index(text_column)
            except ValueError:
                raise ValueError(f"'{text_column}' not a column of "
                                 f"{filename}. Please provide text column "
                                 f"name.") from None

            try:
                label_col_idx = headers.index(label_column)
            except ValueError:
                logger.warning(f"Reading data from {filename} without label, "
                               f"as column {label_column} does not exist.")
                label_col_idx = None

            # csv.reader yields [] for blank lines; skip them instead of
            # failing with an IndexError.
            rows = (row for row in csv_reader if row)
            if label_col_idx is None:
                return [{"text": row[text_col_idx]} for row in rows]
            return [{"text": row[text_col_idx], "label": row[label_col_idx]}
                    for row in rows]