# Source code for text_machina.src.metrics.token_classification

from pathlib import Path
from typing import Dict, List

import evaluate
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)

from ..common.exceptions import InvalidTaskTypeForMetric
from ..types import DetectionLabels, TaskType
from .base import Metric


def prepare_tags_for_mixcase(
    offset_mappings: List[List[List[int]]],
    labels: List[List[Dict]],
    label_mapping: Dict[str, int],
) -> Dict[str, List[List[int]]]:
    """
    Prepares the labels for mixcase tasks to be addressed
    as token classification tasks.

    This fn is designed to work with the `map` HF's function
    using `batched=True`.

    Every token between the first and last offsets is tagged with the
    label of the current span. The first and last positions always get
    the tag -100 (presumably the tokenizer's special tokens -- confirm
    against the tokenizer in use). When a token's end offset reaches the
    current span's `end`, the cursor moves on by at most one span per
    token; it never moves past the last span.

    The input `labels` are only read, never modified, so the same batch
    can be processed more than once with identical results.

    Args:
        offset_mappings (List[List[List[int]]]): offset mappings of each text.
        labels (List[List[Dict]]): labels of each text. Each dict must
            provide the keys `end` and `label`.
        label_mapping (Dict[str, int]): label mapping str to int labels.

    Returns:
        Dict[str, List[List[int]]]: the tags for each text in the batch.

    Raises:
        IndexError: if a text has at least one inner token but no labels.
    """
    tags = []
    for offset_mapping, spans in zip(offset_mappings, labels):
        # Cursor into `spans`. Walking with an index instead of popping
        # from the front leaves the caller's label lists untouched.
        current = 0
        last = len(spans) - 1
        sample_tags = [-100]
        for _, char_end in offset_mapping[1:-1]:
            # Advance one span once the token ends at/after the current
            # span's end, but keep reusing the final span when exhausted.
            if char_end >= spans[current]["end"] and current < last:
                current += 1
            sample_tags.append(label_mapping[spans[current]["label"]])
        sample_tags.append(-100)
        tags.append(sample_tags)
    return {"labels": tags}


def prepare_tags_for_boundary(
    offset_mappings: List[List[List[int]]],
    labels: List[int],
    label_mapping: Dict[str, int],
) -> Dict[str, List[List[int]]]:
    """
    Builds token-level tags for boundary tasks so they can be
    handled as token classification tasks.

    Meant to be passed to HF's `map` with `batched=True`.

    Tokens whose end offset is at or before the boundary position are
    tagged as human; the remaining ones are tagged as generated. The
    first and last positions of every text receive the tag -100.

    Args:
        offset_mappings (List[List[List[int]]]): offset mappings of each text.
        labels (List[int]): boundary position of each text.
        label_mapping (Dict[str, int]): label mapping str to int labels.

    Returns:
        Dict[str, List[List[int]]]: the tags for each text in the batch.
    """
    batch_tags = []
    for offsets, boundary in zip(offset_mappings, labels):
        # The mapping is looked up per token on purpose: a label that is
        # never needed is never required to be present in `label_mapping`.
        inner_tags = [
            (
                label_mapping[DetectionLabels.HUMAN.value]
                if token_end <= boundary
                else label_mapping[DetectionLabels.GENERATED.value]
            )
            for _, token_end in offsets[1:-1]
        ]
        batch_tags.append([-100, *inner_tags, -100])
    return {"labels": batch_tags}


def prepare_dataset(
    dataset: Dataset,
    task_type: TaskType,
    tokenizer: AutoTokenizer,
    test_size: float,
    label_mapping: Dict[str, int],
) -> DatasetDict:
    """
    Tokenizes the texts, derives token-level labels and splits the
    result into train and test partitions.

    The dataset is read through its `text` and `label` columns; only
    `input_ids`, `attention_mask` and `labels` are kept in the output.

    Args:
        dataset (Dataset): a dataset.
        task_type (TaskType): mixcase selects `prepare_tags_for_mixcase`,
            any other value selects `prepare_tags_for_boundary`.
        tokenizer (AutoTokenizer): a tokenizer.
        test_size (float): proportion reserved for the test set.
        label_mapping (Dict[str, int]): label mapping str to int labels.

    Returns:
        DatasetDict: a dataset with train and test splits.
    """

    def tokenize(texts):
        # Offsets are requested because the tagging step needs them.
        return tokenizer(texts, truncation=True, return_offsets_mapping=True)

    tokenized = dataset.map(tokenize, input_columns=["text"], batched=True)

    if task_type == TaskType.MIXCASE:
        tagger = prepare_tags_for_mixcase
    else:
        tagger = prepare_tags_for_boundary

    tagged = tokenized.map(
        tagger,
        input_columns=["offset_mapping", "label"],
        batched=True,
        fn_kwargs={"label_mapping": label_mapping},
    )
    model_inputs = tagged.select_columns(
        ["input_ids", "attention_mask", "labels"]
    )
    return model_inputs.train_test_split(test_size=test_size)


def fit(
    model: AutoModelForTokenClassification,
    dataset: Dataset,
    tokenizer: AutoTokenizer,
    training_args: Dict,
) -> None:
    """
    Trains a model on a dataset with the HF's Trainer.

    Args:
        model (AutoModelForTokenClassification): a model.
        dataset (Dataset): a training dataset.
        tokenizer (AutoTokenizer): a tokenizer, used to build the
            token classification data collator.
        training_args (Dict): keyword args forwarded to HF's
            `TrainingArguments`; `do_train=True` is always added.
    """
    hf_args = TrainingArguments(do_train=True, **training_args)
    trainer = Trainer(
        model,
        args=hf_args,
        data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer),
        train_dataset=dataset,
    )
    trainer.train()


def predict(
    model: AutoModelForTokenClassification,
    dataset: Dataset,
    tokenizer: AutoTokenizer,
) -> List[List[int]]:
    """
    Runs a model over a dataset and picks the best label per position.

    Args:
        model (AutoModelForTokenClassification): a model.
        dataset (Dataset): a test dataset.
        tokenizer (AutoTokenizer): a tokenizer, used to build the
            token classification data collator.

    Returns:
        List[List[int]]: predicted label ids for each example, computed
            as the argmax over the last axis of the Trainer's
            predictions (the value returned by `np.argmax`).
    """
    trainer = Trainer(
        model,
        data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer),
    )
    output = trainer.predict(dataset)
    return np.argmax(output.predictions, axis=-1)


def eval(
    predictions: List[List[int]],
    references: List[List[int]],
    label_mapping: Dict[int, str],
) -> Dict[str, float]:
    """
    Evaluates using `seqeval` from HF metrics.

    Positions whose gold label is -100 are ignored. Each prediction row
    is paired position by position with its gold row, so any trailing
    prediction positions beyond the gold row's length are discarded too.

    Args:
        predictions (List[List[int]]): list of predictions for each example.
        references (List[List[int]]): list of gold labels for each example.
        label_mapping (Dict[int, str]): label mapping int to str labels.

    Returns:
        Dict[str, float]: dictionary with the overall precision, recall,
            f1 and accuracy.
    """
    preds = []
    refs = []
    for predicted_ids, gold_ids in zip(predictions, references):
        # Filter predictions and gold labels from the SAME pairs so that
        # position i of `preds` is always scored against position i of
        # `refs`. Filtering them separately shifted one against the other.
        kept = [
            (predicted, gold)
            for predicted, gold in zip(predicted_ids, gold_ids)
            if gold != -100
        ]
        preds.append([label_mapping[predicted] for predicted, _ in kept])
        refs.append([label_mapping[gold] for _, gold in kept])

    seqeval = evaluate.load("seqeval")
    results = seqeval.compute(predictions=preds, references=refs)

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


class TokenClassificationMetric(Metric):
    """
    Trains a HF token classification model on one split of the
    dataset and scores it with seqeval on the held-out split.

    Supported tasks: boundary and mixcase.
    """

    def _run(self, dataset: Dataset, **kwargs) -> pd.DataFrame:
        """
        Prepares the data, fits the model and evaluates it.

        Expected kwargs: `model_args` (forwarded to `from_pretrained`,
        must contain `pretrained_model_name_or_path`), `test_size`,
        `label_mapping` (str to int) and `training_args`.

        Returns:
            pd.DataFrame: a single-row frame with the metric values.

        Raises:
            InvalidTaskTypeForMetric: if the task is neither mixcase
                nor boundary.
        """
        supported_tasks = (TaskType.MIXCASE, TaskType.BOUNDARY)
        if self.task_type not in supported_tasks:
            raise InvalidTaskTypeForMetric(self.name, self.task_type)

        model_args = kwargs["model_args"]
        model = AutoModelForTokenClassification.from_pretrained(**model_args)
        tokenizer = AutoTokenizer.from_pretrained(
            model_args["pretrained_model_name_or_path"]
        )

        splits = prepare_dataset(
            dataset,
            self.task_type,
            tokenizer,
            kwargs["test_size"],
            kwargs["label_mapping"],
        )

        fit(model, splits["train"], tokenizer, kwargs["training_args"])
        predictions = predict(model, splits["test"], tokenizer)

        gold_labels = splits["test"]["labels"]
        # seqeval works on label names, so invert the str -> int mapping.
        id_to_label = {
            idx: name for name, idx in kwargs["label_mapping"].items()
        }
        scores = eval(predictions, gold_labels, id_to_label)
        return pd.DataFrame(scores, index=[0])

    def _save(self, outputs: pd.DataFrame, path: Path) -> None:
        """Writes the metric values to `full_outputs.csv` under `path`."""
        target = path / "full_outputs.csv"
        outputs.to_csv(target, index=False)

    def _log(self, outputs: pd.DataFrame, logger) -> None:
        """Logs the metric values at info level."""
        logger.info(f"Seqeval results: {outputs}")