Source code for text_machina.src.extractors.noun_list
from typing import Dict, List, Set
import spacy
from datasets import Dataset
from ..config import InputConfig
from ..types import TaskType
from .base import Extractor
from .types import EXTRACTOR_OMITTED
from .utils import spacy_pipeline
[docs]def extract_nouns(processed_text: spacy.tokens.Doc) -> Set[str]:
"""
Extracts noun chunks from a Spacy doc.
Args:
processed_text (Doc): Spacy doc.
Returns:
Set[str]: noun chunks in the doc.
"""
# take noun chunks -> nouns -> roots -> disregard
nouns = [x.text for x in processed_text.noun_chunks]
if not nouns:
nouns = [x.text for x in processed_text if x.pos_ == "NOUN"]
if not nouns:
nouns = [x.text for x in processed_text if x.dep_ == "ROOT"]
else:
nouns = [EXTRACTOR_OMITTED] # to filter after
return set(nouns)
[docs]class NounList(Extractor):
"""
Extractor that fills the prompt template with noun-phrases
extracted from a text column in the dataset.
This extractor needs a template placeholder named {nouns}.
This extractor does not need specific arguments.
"""
def __init__(self, input_config: InputConfig, task_type: TaskType):
super().__init__(input_config, task_type)
def _extract(self, dataset: Dataset) -> Dict[str, List[str]]:
processed_texts = spacy_pipeline(
dataset[self.input_config.dataset_text_column],
language=self.input_config.language,
disable_pipes=[
"ner",
"senter",
"attribute_ruler",
"lemmatizer",
],
)
nouns = [", ".join(extract_nouns(text)) for text in processed_texts]
return {"nouns": nouns}