📝 Config#

class text_machina.src.config.Config(**data)[source]#

Bases: BaseModel

Wrapper for the config.

classmethod load_config(path, task_type, max_generations=None)[source]#
Return type:

Config

classmethod load_configs(path, task_type, max_generations=None)[source]#
Return type:

List[Config]

safe_dataset_name()[source]#
Return type:

str

safe_domain_name()[source]#
Return type:

str

safe_model_name()[source]#
Return type:

str

generation: Dict[str, Any]#
input: InputConfig#
model: ModelConfig#
model_computed_fields: ClassVar[dict[str, ComputedFieldInfo]] = {}#

A dictionary of computed field names and their corresponding ComputedFieldInfo objects.

model_config: ClassVar[ConfigDict] = {'protected_namespaces': ()}#

Configuration for the model, should be a dictionary conforming to pydantic.config.ConfigDict.

model_fields: ClassVar[dict[str, FieldInfo]] = {'generation': FieldInfo(annotation=Dict[str, Any], required=True), 'input': FieldInfo(annotation=InputConfig, required=True), 'model': FieldInfo(annotation=ModelConfig, required=True), 'path': FieldInfo(annotation=Union[Path, NoneType], required=False, default=None), 'task_type': FieldInfo(annotation=TaskType, required=True)}#

Metadata about the fields defined on the model, mapping of field names to pydantic.fields.FieldInfo objects.

This replaces Model.__fields__ from Pydantic V1.

path: Optional[Path]#
task_type: TaskType#
class text_machina.src.config.InputConfig(**data)[source]#

Bases: BaseModel

Wrapper for the input_config field.

classmethod extractor_must_exist(extractor)[source]#
Return type:

str

classmethod language_must_be_iso639(language)[source]#
Return type:

str

classmethod not_empty_list_in_combined(extractors_list, info)[source]#
Return type:

List[str]

dataset: str#
dataset_params: Dict[str, Any]#
dataset_text_column: str#
domain: str#
extractor: str#
extractor_args: Dict[str, Dict[str, Any]]#
extractors_list: List[str]#
language: str#
max_input_tokens: int#
model_computed_fields: ClassVar[dict[str, ComputedFieldInfo]] = {}#

A dictionary of computed field names and their corresponding ComputedFieldInfo objects.

model_config: ClassVar[ConfigDict] = {}#

Configuration for the model, should be a dictionary conforming to pydantic.config.ConfigDict.

model_fields: ClassVar[dict[str, FieldInfo]] = {'dataset': FieldInfo(annotation=str, required=True, description='Name (HF Hub) or path to the dataset.'), 'dataset_params': FieldInfo(annotation=Dict[str, Any], required=True, description='Arguments to load the dataset.'), 'dataset_text_column': FieldInfo(annotation=str, required=True, description='Name of column in the dataset containing the text.'), 'domain': FieldInfo(annotation=str, required=True, description='Domain of a dataset.'), 'extractor': FieldInfo(annotation=str, required=True, description='Extractor name.'), 'extractor_args': FieldInfo(annotation=Dict[str, Dict[str, Any]], required=False, default={}, json_schema_extra={'desc': 'Extractors-specific arguments.'}), 'extractors_list': FieldInfo(annotation=List[str], required=False, default=[], description='List of extractors to be used with the `combined` extractor.', validate_default=True), 'language': FieldInfo(annotation=str, required=False, default='en', json_schema_extra={'desc': 'Language of the dataset used.'}, validate_default=True), 'max_input_tokens': FieldInfo(annotation=int, required=False, default=256, json_schema_extra={'desc': 'Maximum token length to be distributed across the prompt inputs extracted with the extractors.'}, metadata=[Gt(gt=0)]), 'quantity': FieldInfo(annotation=int, required=True, description='Number of samples to generate.', metadata=[Gt(gt=0)]), 'random_sample_human': FieldInfo(annotation=bool, required=False, default=False, json_schema_extra={'desc': 'Whether to randomly sample human texts or use the same ones used to generate MGT'}), 'template': FieldInfo(annotation=str, required=True, description='Template for the generations.')}#

Metadata about the fields defined on the model, mapping of field names to pydantic.fields.FieldInfo objects.

This replaces Model.__fields__ from Pydantic V1.

quantity: int#
random_sample_human: bool#
template: str#
class text_machina.src.config.ModelConfig(**data)[source]#

Bases: BaseModel

Wrapper for the model_config field.

classmethod provider_must_exist(provider)[source]#
Return type:

str

api_type: Literal['CHAT', 'COMPLETION']#
model_computed_fields: ClassVar[dict[str, ComputedFieldInfo]] = {}#

A dictionary of computed field names and their corresponding ComputedFieldInfo objects.

model_config: ClassVar[ConfigDict] = {'extra': 'allow', 'protected_namespaces': ()}#

Configuration for the model, should be a dictionary conforming to pydantic.config.ConfigDict.

model_fields: ClassVar[dict[str, FieldInfo]] = {'api_type': FieldInfo(annotation=Literal['CHAT', 'COMPLETION'], required=False, default='COMPLETION', description='API type for providers that allow chat and completion endpoints. This arg must be `CHAT` or `COMPLETION` and must be according to the model used:\n- `CHAT`: for chat completion endpoints.\n- `COMPLETION`: for traditional completion endpoints.\nFor instance, GPT-4 in OpenAI can only be used with `CHAT`.'), 'model_name': FieldInfo(annotation=str, required=True, description='Name of a text generation model.'), 'provider': FieldInfo(annotation=str, required=True, description='Provider of text generation models.'), 'threads': FieldInfo(annotation=int, required=False, default=8, description='Number of threads to use in `generate_completions`', metadata=[Gt(gt=0)])}#

Metadata about the fields defined on the model, mapping of field names to pydantic.fields.FieldInfo objects.

This replaces Model.__fields__ from Pydantic V1.

model_name: str#
provider: str#
threads: int#
text_machina.src.config.parse_metrics_config(path)[source]#

Parses a metrics config.

Parameters:

path (Path) – the metric config path to parse.

Returns:

a tuple of structure (list of metric names, args).

Return type:

Tuple[List[str], Dict]