Source code for text_machina.src.metrics.perplexity

from pathlib import Path

import evaluate
import pandas as pd
from datasets import Dataset

from ..common.exceptions import InvalidTaskTypeForMetric
from ..types import TaskType
from .base import Metric


[docs]class PerplexityMetric(Metric): """ Implements the perplexity metric. Supported tasks: detection, attribution, and boundary. """ def _run(self, dataset: Dataset, **kwargs) -> pd.DataFrame: metric = evaluate.load("perplexity", module_type="metric") # If we evaluate complete texts we can run easily grouping by labels if self.task_type in {TaskType.DETECTION, TaskType.ATTRIBUTION}: label = dataset["label"] text = dataset["text"] # Otherwise we need to separate the sections that pertain to different labels elif self.task_type == TaskType.BOUNDARY: df = dataset.to_pandas()[["text", "label"]] human_text = df.apply( lambda x: x["text"][: x["label"]], axis=1 ).tolist() generated_text = df.apply( lambda x: x["text"][x["label"] :], axis=1 ).tolist() text = human_text + generated_text label = ["human"] * len(human_text) + ["generated"] * len( generated_text ) else: raise InvalidTaskTypeForMetric(self.name, self.task_type) result = metric.compute(predictions=text, **kwargs) df = pd.DataFrame( {"label": label, "perplexity": result["perplexities"]} ) return df def _save(self, outputs: pd.DataFrame, path: Path) -> None: outputs.to_csv(path / "full_outputs.csv") outputs.drop("label", axis=1).mean(axis=0).to_json( path / "summary.json", indent=4 ) outputs.groupby("label").mean().reset_index().to_json( path / "per_label_summary.json", indent=4 ) def _log(self, outputs: pd.DataFrame, logger) -> None: summary = outputs.groupby("label").mean().reset_index() if self.task_type == TaskType.BOUNDARY: logger.info(f"Mean perplexity of segments: {summary.to_dict()}") else: logger.info(f"Mean perplexity: {summary.to_dict()}")