value
        
DataProfilerModule            (KiaraModule)
        
¶
    Generate a data profile report for a dataset.
This uses the DataProfiler Python library, check out its documentation for more details.
Source code in core/value.py
          class DataProfilerModule(KiaraModule):
              """Generate a data profile report for a dataset.
              This uses the [DataProfiler](https://capitalone.github.io/DataProfiler/docs/0.7.0/html/index.html) Python library,
              check out its documentation for more details.
              """

              _module_type_name = "data_profile"
              _config_cls = DataProfilerModuleConfig

              @classmethod
              def retrieve_module_profiles(
                  cls, kiara: "Kiara"
              ) -> typing.Mapping[str, typing.Union[typing.Mapping[str, typing.Any], Operation]]:
                  """Return one operation config per supported source type.

                  The supported source types are "table" and "file"; each config is
                  stored under the key ``profile.<source type>.data`` and carries the
                  documentation taken from this module's type metadata.
                  """
                  documentation = cls.get_type_metadata().documentation
                  module_type = cls._module_type_id  # type: ignore
                  return {
                      f"profile.{source_type}.data": {
                          "module_type": module_type,
                          "module_config": {"value_type": source_type},
                          "doc": documentation,
                      }
                      for source_type in ("table", "file")
                  }

              def create_input_schema(
                  self,
              ) -> typing.Mapping[
                  str, typing.Union[ValueSchema, typing.Mapping[str, typing.Any]]
              ]:
                  """Describe the single ``item`` input, typed by the configured value type."""
                  item_schema: typing.Mapping[str, typing.Any] = {
                      "type": self.get_config_value("value_type"),
                      "doc": f"The {self.get_config_value('value_type')} to profile.",
                  }
                  return {"item": item_schema}

              def create_output_schema(
                  self,
              ) -> typing.Mapping[
                  str, typing.Union[ValueSchema, typing.Mapping[str, typing.Any]]
              ]:
                  """Describe the single ``report`` output, a value of type ``dict``."""
                  report_schema: typing.Mapping[str, typing.Any] = {
                      "type": "dict",
                      "doc": "Statistics/details about the dataset.",
                  }
                  return {"report": report_schema}

              def process(self, inputs: ValueSet, outputs: ValueSet) -> None:
                  """Profile the ``item`` input and store the result in the ``report`` output.

                  Raises:
                      KiaraProcessingException: if the configured value type is neither
                          "table" nor "file".
                  """
                  import pyarrow as pa
                  from dataprofiler import Data, Profiler, ProfilerOptions, set_verbosity

                  set_verbosity(logging.WARNING)

                  value_type = self.get_config_value("value_type")

                  # The data labeler is switched off for structured and unstructured data alike.
                  options = ProfilerOptions()
                  options.structured_options.data_labeler.is_enabled = False
                  options.unstructured_options.data_labeler.is_enabled = False

                  # Pick the profiler input that matches the configured value type.
                  if value_type == "table":
                      arrow_table: pa.Table = inputs.get_value_data("item")
                      source = arrow_table.to_pandas()
                  elif value_type == "file":
                      kiara_file: KiaraFile = inputs.get_value_data("item")
                      source = Data(kiara_file.path)
                  else:
                      raise KiaraProcessingException(
                          f"Data profiling of value type '{value_type}' not supported."
                      )

                  report = Profiler(source, options=options).report()
                  outputs.set_value("report", report)
create_input_schema(self)
¶
    Abstract method to be implemented by child classes; returns a description of the input schema of this module.
If returning a dictionary of dictionaries, the format of the return value is as follows (items with '*' are optional):
{
      "[input_field_name]": {
          "type": "[value_type]",
          "doc*": "[a description of this input]",
          "optional*": [boolean whether this input is optional or required (defaults to 'False')]
      },
      "[other_input_field_name]": {
          "type": ...
          ...
      }
}
Source code in core/value.py
          def create_input_schema(
              self,
          ) -> typing.Mapping[
              str, typing.Union[ValueSchema, typing.Mapping[str, typing.Any]]
          ]:
              """Describe the single ``item`` input, typed by the configured value type."""
              item_schema: typing.Mapping[str, typing.Any] = {
                  "type": self.get_config_value("value_type"),
                  "doc": f"The {self.get_config_value('value_type')} to profile.",
              }
              return {"item": item_schema}
create_output_schema(self)
¶
    Abstract method to be implemented by child classes; returns a description of the output schema of this module.
If returning a dictionary of dictionaries, the format of the return value is as follows (items with '*' are optional):
{
      "[output_field_name]": {
          "type": "[value_type]",
          "doc*": "[a description of this output]"
      },
      "[other_output_field_name]": {
          "type": ...
          ...
      }
}
Source code in core/value.py
          def create_output_schema(
              self,
          ) -> typing.Mapping[
              str, typing.Union[ValueSchema, typing.Mapping[str, typing.Any]]
          ]:
              """Describe the single ``report`` output, a value of type ``dict``."""
              report_schema: typing.Mapping[str, typing.Any] = {
                  "type": "dict",
                  "doc": "Statistics/details about the dataset.",
              }
              return {"report": report_schema}
retrieve_module_profiles(kiara)
  
      classmethod
  
¶
    Retrieve a collection of profiles (pre-set module configs) for this kiara module type.
This is used to automatically create generally useful operations (incl. their ids).
Source code in core/value.py
          @classmethod
          def retrieve_module_profiles(
              cls, kiara: "Kiara"
          ) -> typing.Mapping[str, typing.Union[typing.Mapping[str, typing.Any], Operation]]:
              """Return one operation config per supported source type.

              The supported source types are "table" and "file"; each config is
              stored under the key ``profile.<source type>.data`` and carries the
              documentation taken from this module's type metadata.
              """
              documentation = cls.get_type_metadata().documentation
              module_type = cls._module_type_id  # type: ignore
              return {
                  f"profile.{source_type}.data": {
                      "module_type": module_type,
                      "module_config": {"value_type": source_type},
                      "doc": documentation,
                  }
                  for source_type in ("table", "file")
              }
        
DataProfilerModuleConfig            (ModuleTypeConfigSchema)
        
  
      pydantic-model
  
¶
    Source code in core/value.py
          class DataProfilerModuleConfig(ModuleTypeConfigSchema):
    # Configuration schema used by DataProfilerModule (set as its `_config_cls`).
    # `value_type` is required (no default); the module's own profiles use
    # "table" and "file", and its `process` rejects any other value.
    value_type: str = Field(description="The value type to profile.")
value_type: str
  
      pydantic-field
      required
  
¶
    The value type to profile.