# Source code for text_machina.src.extractors.noun_list

from typing import Dict, List, Set

import spacy
from datasets import Dataset

from ..config import InputConfig
from ..types import TaskType
from .base import Extractor
from .types import EXTRACTOR_OMITTED
from .utils import spacy_pipeline


def extract_nouns(processed_text: spacy.tokens.Doc) -> Set[str]:
    """
    Extracts noun phrases from a Spacy doc, falling back to coarser
    candidates when none are found.

    The candidates are tried in order and the first non-empty one wins:

    1. the texts of the doc's noun chunks,
    2. the texts of the tokens tagged as ``NOUN``,
    3. the texts of the tokens whose dependency label is ``ROOT``,
    4. the ``EXTRACTOR_OMITTED`` marker, when nothing at all was found.

    Args:
        processed_text (Doc): Spacy doc.

    Returns:
        Set[str]: unique extracted texts, or a set holding only
            ``EXTRACTOR_OMITTED`` when the doc yields no candidate.
    """
    # take noun chunks -> nouns -> roots -> disregard
    nouns = [chunk.text for chunk in processed_text.noun_chunks]
    if not nouns:
        nouns = [tok.text for tok in processed_text if tok.pos_ == "NOUN"]
    if not nouns:
        nouns = [tok.text for tok in processed_text if tok.dep_ == "ROOT"]
    if not nouns:
        # Only mark the example as omitted when every fallback came up
        # empty; the marker is meant to be filtered out afterwards.
        nouns = [EXTRACTOR_OMITTED]
    return set(nouns)


class NounList(Extractor):
    """
    Extractor that fills the prompt template with noun-phrases
    extracted from a text column in the dataset.

    This extractor needs a template placeholder named {nouns}.

    This extractor does not need specific arguments.
    """

    def __init__(self, input_config: InputConfig, task_type: TaskType):
        super().__init__(input_config, task_type)

    def _extract(self, dataset: Dataset) -> Dict[str, List[str]]:
        """
        Builds the value of the {nouns} placeholder for each example.

        Args:
            dataset (Dataset): dataset holding the text column named by
                `input_config.dataset_text_column`.

        Returns:
            Dict[str, List[str]]: a single "nouns" entry with one
                comma-separated string per processed text.
        """
        # These pipes are switched off for the whole batch; presumably
        # they are not needed to obtain noun chunks, POS tags and
        # dependency labels -- confirm against `spacy_pipeline`.
        processed_texts = spacy_pipeline(
            dataset[self.input_config.dataset_text_column],
            language=self.input_config.language,
            disable_pipes=[
                "ner",
                "senter",
                "attribute_ruler",
                "lemmatizer",
            ],
        )
        # `extract_nouns` returns a set, whose iteration order changes
        # between interpreter runs (string hash randomisation). Sorting
        # keeps the generated strings reproducible.
        nouns = [
            ", ".join(sorted(extract_nouns(doc))) for doc in processed_texts
        ]
        return {"nouns": nouns}