Skip to content

value

DataProfilerModule (KiaraModule)

Generate a data profile report for a dataset.

This uses the DataProfiler Python library, check out its documentation for more details.

Source code in core/value.py
class DataProfilerModule(KiaraModule):
    """Generate a data profile report for a dataset.

    This uses the [DataProfiler](https://capitalone.github.io/DataProfiler/docs/0.7.0/html/index.html) Python library,
    check out its documentation for more details.
    """

    _module_type_name = "data_profile"
    _config_cls = DataProfilerModuleConfig

    @classmethod
    def retrieve_module_profiles(
        cls, kiara: "Kiara"
    ) -> typing.Mapping[str, typing.Union[typing.Mapping[str, typing.Any], Operation]]:
        """Return one pre-set module config per supported source type.

        The result maps ``profile.<type>.data`` (for ``table`` and ``file``) to a
        config dict holding the module type, the ``value_type`` module config
        and the documentation taken from this class' type metadata.
        """
        documentation = cls.get_type_metadata().documentation

        return {
            f"profile.{source_type}.data": {
                "module_type": cls._module_type_id,  # type: ignore
                "module_config": {"value_type": source_type},
                "doc": documentation,
            }
            for source_type in ("table", "file")
        }

    def create_input_schema(
        self,
    ) -> typing.Mapping[
        str, typing.Union[ValueSchema, typing.Mapping[str, typing.Any]]
    ]:
        """Describe the single ``item`` input, typed after the configured value type."""
        configured_type = self.get_config_value("value_type")

        item_schema: typing.Mapping[str, typing.Any] = {
            "type": configured_type,
            "doc": f"The {configured_type} to profile.",
        }
        return {"item": item_schema}

    def create_output_schema(
        self,
    ) -> typing.Mapping[
        str, typing.Union[ValueSchema, typing.Mapping[str, typing.Any]]
    ]:
        """Describe the single ``report`` output (a dict)."""
        report_schema: typing.Mapping[str, typing.Any] = {
            "type": "dict",
            "doc": "Statistics/details about the dataset.",
        }
        return {"report": report_schema}

    def process(self, inputs: ValueSet, outputs: ValueSet) -> None:
        """Profile the ``item`` input and store the result in the ``report`` output.

        A ``table`` item is converted to a pandas object before profiling; a
        ``file`` item is loaded through ``dataprofiler.Data`` from its path.

        Raises:
            KiaraProcessingException: if the configured value type is neither
                ``table`` nor ``file``.
        """
        # Imported lazily, inside the method, exactly as the module always did.
        import pyarrow as pa
        from dataprofiler import Data, Profiler, ProfilerOptions, set_verbosity

        set_verbosity(logging.WARNING)

        source_type = self.get_config_value("value_type")

        # The data labeler is switched off for structured and unstructured data.
        options = ProfilerOptions()
        options.structured_options.data_labeler.is_enabled = False
        options.unstructured_options.data_labeler.is_enabled = False

        # Guard clause: reject unsupported types before touching the inputs.
        if source_type not in ("table", "file"):
            raise KiaraProcessingException(
                f"Data profiling of value type '{source_type}' not supported."
            )

        if source_type == "table":
            arrow_table: pa.Table = inputs.get_value_data("item")
            profiler_input = arrow_table.to_pandas()
        else:
            kiara_file: KiaraFile = inputs.get_value_data("item")
            profiler_input = Data(kiara_file.path)

        report = Profiler(profiler_input, options=options).report()
        outputs.set_value("report", report)

create_input_schema(self)

Abstract method to implement by child classes, returns a description of the input schema of this module.

If returning a dictionary of dictionaries, the format of the return value is as follows (items with '*' are optional):

{ "[input_field_name]": { "type": "[value_type]", "doc*": "[a description of this input]", "optional*": [boolean whether this input is optional or required (defaults to 'False')] }, "[other_input_field_name]": { "type": ... }, ... }

Source code in core/value.py
def create_input_schema(
    self,
) -> typing.Mapping[
    str, typing.Union[ValueSchema, typing.Mapping[str, typing.Any]]
]:
    """Describe the single ``item`` input, typed after the configured value type."""
    configured_type = self.get_config_value("value_type")

    item_schema: typing.Mapping[str, typing.Any] = {
        "type": configured_type,
        "doc": f"The {configured_type} to profile.",
    }
    return {"item": item_schema}

create_output_schema(self)

Abstract method to implement by child classes, returns a description of the output schema of this module.

If returning a dictionary of dictionaries, the format of the return value is as follows (items with '*' are optional):

{ "[output_field_name]": { "type": "[value_type]", "doc*": "[a description of this output]" }, "[other_output_field_name]": { "type": ... }, ... }

Source code in core/value.py
def create_output_schema(
    self,
) -> typing.Mapping[
    str, typing.Union[ValueSchema, typing.Mapping[str, typing.Any]]
]:
    """Describe the single ``report`` output (a dict)."""
    report_schema: typing.Mapping[str, typing.Any] = {
        "type": "dict",
        "doc": "Statistics/details about the dataset.",
    }
    return {"report": report_schema}

retrieve_module_profiles(kiara) classmethod

Retrieve a collection of profiles (pre-set module configs) for this kiara module type.

This is used to automatically create generally useful operations (incl. their ids).

Source code in core/value.py
@classmethod
def retrieve_module_profiles(
    cls, kiara: "Kiara"
) -> typing.Mapping[str, typing.Union[typing.Mapping[str, typing.Any], Operation]]:
    """Return one pre-set module config per supported source type.

    The result maps ``profile.<type>.data`` (for ``table`` and ``file``) to a
    config dict holding the module type, the ``value_type`` module config and
    the documentation taken from this class' type metadata.
    """
    documentation = cls.get_type_metadata().documentation

    return {
        f"profile.{source_type}.data": {
            "module_type": cls._module_type_id,  # type: ignore
            "module_config": {"value_type": source_type},
            "doc": documentation,
        }
        for source_type in ("table", "file")
    }

DataProfilerModuleConfig (ModuleTypeConfigSchema) pydantic-model

Source code in core/value.py
# Configuration schema for DataProfilerModule (referenced there as `_config_cls`).
class DataProfilerModuleConfig(ModuleTypeConfigSchema):

    # Name of the value type a module instance profiles. DataProfilerModule
    # handles "table" and "file" and raises for any other value.
    value_type: str = Field(description="The value type to profile.")

value_type: str pydantic-field required

The value type to profile.