# Source code for text_machina.src.generators.detection
from typing import List
from datasets import Dataset, concatenate_datasets
from ..common.exceptions import DatasetGenerationError
from ..config import Config
from ..types import DetectionLabels, Placeholders
from .base import DatasetGenerator
class DetectionDatasetGenerator(DatasetGenerator):
    """
    Dataset generator for the detection task type.

    Packs generated texts and human texts into a single labeled dataset,
    where each row carries the columns ``prompt``, ``text``, ``label``,
    ``model``, ``domain`` and ``extractor``.
    """

    def __init__(self, config: Config) -> None:
        """
        Initializes the generator by forwarding the config to the base class.

        Args:
            config (Config): the configuration passed to the base generator.
        """
        super().__init__(config=config)

    def _pack(self, generations: List[str], **kwargs) -> Dataset:
        """
        Combines and labels the generated and human texts.

        Generated rows are labeled with `DetectionLabels.GENERATED` and keep
        the prompt, model name and extractor they were produced with. Human
        rows are labeled with `DetectionLabels.HUMAN` and use placeholder
        values for the prompt and the extractor, since none were involved.

        Args:
            generations (List[str]): list of generated texts.
            kwargs: additional keyword arguments. Must include
                `prompted_dataset`, an object exposing `prompted_texts`
                and `human_texts`.

        Returns:
            Dataset: a shuffled dataset including all the texts.

        Raises:
            DatasetGenerationError: if `prompted_dataset` is missing from
                `kwargs` or is None.
        """
        prompted_dataset = kwargs.get("prompted_dataset")
        if prompted_dataset is None:
            raise DatasetGenerationError(f"prompted_dataset not found: {self}")

        model_name = self.config.model.model_name
        domain = self.config.input.domain
        extractor = self.config.input.extractor

        # Prompts and generations are paired by position.
        # NOTE(review): zip stops at the shorter input, so a length mismatch
        # would silently drop rows -- confirm callers guarantee equal lengths.
        generated_dataset = Dataset.from_list(
            [
                {
                    "prompt": prompt,
                    "text": text,
                    "label": DetectionLabels.GENERATED.value,
                    "model": model_name,
                    "domain": domain,
                    "extractor": extractor,
                }
                for prompt, text in zip(
                    prompted_dataset.prompted_texts, generations
                )
            ]
        )

        # Human texts have no prompt, model or extractor of their own, so
        # placeholders (and the human label as "model") keep the schema
        # identical to the generated rows, which concatenation relies on.
        human_dataset = Dataset.from_list(
            [
                {
                    "prompt": Placeholders.NO_PROMPT.value,
                    "text": text,
                    "label": DetectionLabels.HUMAN.value,
                    "model": DetectionLabels.HUMAN.value,
                    "domain": domain,
                    "extractor": Placeholders.NO_EXTRACTOR.value,
                }
                for text in prompted_dataset.human_texts
            ]
        )

        # Shuffle so human and generated rows are interleaved rather than
        # stored as two contiguous runs.
        dataset = concatenate_datasets([human_dataset, generated_dataset])
        return dataset.shuffle()