Skip to content

array

Attributes

FORCE_NON_NULL_DOC = "If set to 'True', raise an error if any of the strings in the array can't be parsed." module-attribute

MIN_INDEX_DOC = 'The minimum index from where to start parsing the string(s).' module-attribute

MAX_INDEX_DOC = 'The maximum index until which to parse the string(s).' module-attribute

REMOVE_TOKENS_DOC = 'A list of tokens/characters to replace with a single white-space before parsing the input.' module-attribute

Classes

DeserializeArrayModule

Bases: DeserializeValueModule

Deserialize array data.

Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/array/__init__.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
class DeserializeArrayModule(DeserializeValueModule):
    """Deserialize array data."""

    _module_type_name = "load.array"

    @classmethod
    def retrieve_supported_target_profiles(cls) -> Mapping[str, Type]:
        """Return the target profiles: ``python_object`` maps to ``KiaraArray``."""
        profiles = {"python_object": KiaraArray}
        return profiles

    @classmethod
    def retrieve_serialized_value_type(cls) -> str:
        """Return the serialized value type name, ``"array"``."""
        value_type = "array"
        return value_type

    @classmethod
    def retrieve_supported_serialization_profile(cls) -> str:
        """Return the serialization profile name, ``"feather"``."""
        profile = "feather"
        return profile

    def to__python_object(self, data: SerializedData, **config: Any):
        """Build a ``KiaraArray`` from the single ``array.arrow`` file in ``data``.

        The serialized data must contain exactly one key (``"array.arrow"``)
        whose payload is exactly one chunk; each of these expectations is
        checked with an ``assert``. ``config`` is accepted but not used.
        """
        arrow_key = "array.arrow"

        # Exactly one key is expected, and it has to be the arrow payload.
        available_keys = list(data.get_keys())
        assert arrow_key in available_keys and len(available_keys) == 1

        chunk_source = data.get_serialized_data(arrow_key)

        # TODO: support multiple chunks
        assert chunk_source.get_number_of_chunks() == 1
        chunk_files = list(chunk_source.get_chunks(as_files=True, symlink_ok=True))
        assert len(chunk_files) == 1

        return KiaraArray(data_path=chunk_files[0])

Functions

retrieve_supported_target_profiles() -> Mapping[str, Type] classmethod
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/array/__init__.py
20
21
22
@classmethod
def retrieve_supported_target_profiles(cls) -> Mapping[str, Type]:
    """Return the target profiles: ``python_object`` maps to ``KiaraArray``."""
    profiles = {"python_object": KiaraArray}
    return profiles
retrieve_serialized_value_type() -> str classmethod
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/array/__init__.py
24
25
26
@classmethod
def retrieve_serialized_value_type(cls) -> str:
    """Return the serialized value type name, ``"array"``."""
    value_type = "array"
    return value_type
retrieve_supported_serialization_profile() -> str classmethod
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/array/__init__.py
28
29
30
@classmethod
def retrieve_supported_serialization_profile(cls) -> str:
    """Return the serialization profile name, ``"feather"``."""
    profile = "feather"
    return profile
to__python_object(data: SerializedData, **config: Any)
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/array/__init__.py
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
def to__python_object(self, data: SerializedData, **config: Any):
    """Build a ``KiaraArray`` from the single ``array.arrow`` file in ``data``.

    The serialized data must contain exactly one key (``"array.arrow"``)
    whose payload is exactly one chunk; each of these expectations is
    checked with an ``assert``. ``config`` is accepted but not used.
    """
    arrow_key = "array.arrow"

    # Exactly one key is expected, and it has to be the arrow payload.
    available_keys = list(data.get_keys())
    assert arrow_key in available_keys and len(available_keys) == 1

    chunk_source = data.get_serialized_data(arrow_key)

    # TODO: support multiple chunks
    assert chunk_source.get_number_of_chunks() == 1
    chunk_files = list(chunk_source.get_chunks(as_files=True, symlink_ok=True))
    assert len(chunk_files) == 1

    return KiaraArray(data_path=chunk_files[0])

ExtractDateConfig

Bases: KiaraInputsConfig

Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/array/__init__.py
55
56
57
58
59
60
61
62
63
64
65
class ExtractDateConfig(KiaraInputsConfig):

    # Defaults to True; described by FORCE_NON_NULL_DOC.
    force_non_null: bool = Field(default=True, description=FORCE_NON_NULL_DOC)

    # Optional cut positions, both unset by default; described by
    # MIN_INDEX_DOC / MAX_INDEX_DOC.
    min_index: Union[None, int] = Field(default=None, description=MIN_INDEX_DOC)
    max_index: Union[None, int] = Field(default=None, description=MAX_INDEX_DOC)

    # Defaults to a fresh empty list per instance; described by REMOVE_TOKENS_DOC.
    remove_tokens: List[str] = Field(
        default_factory=list,
        description=REMOVE_TOKENS_DOC,
    )

Attributes

force_non_null: bool = Field(description=FORCE_NON_NULL_DOC, default=True) class-attribute instance-attribute
min_index: Union[None, int] = Field(description=MIN_INDEX_DOC, default=None) class-attribute instance-attribute
max_index: Union[None, int] = Field(description=MAX_INDEX_DOC, default=None) class-attribute instance-attribute
remove_tokens: List[str] = Field(description=REMOVE_TOKENS_DOC, default_factory=list) class-attribute instance-attribute

Functions

ExtractDateModule

Bases: AutoInputsKiaraModule

Create an array of date objects from an array of strings.

This module is very simplistic at the moment, more functionality and options will be added in the future.

At its core, this module uses the standard parser from the dateutil package to parse strings into dates. As this parser can't handle complex strings, the input strings can be pre-processed in the following ways:

  • 'cut' non-relevant parts of the string (using 'min_index' & 'max_index' input/config options)
  • remove matching tokens from the string, and replace them with a single whitespace (using the 'remove_tokens' option)

By default, if an input string can't be parsed this module will raise an exception. This can be prevented by setting this module's 'force_non_null' config option or input to 'False', in which case un-parsable strings will appear as 'NULL' values in the resulting array.

Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/array/__init__.py
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
class ExtractDateModule(AutoInputsKiaraModule):
    """Create an array of date objects from an array of strings.

    This module is very simplistic at the moment, more functionality and options will be added in the future.

    At its core, this module uses the standard parser from the
    [dateutil](https://github.com/dateutil/dateutil) package to parse strings into dates. As this parser can't handle
    complex strings, the input strings can be pre-processed in the following ways:

    - 'cut' non-relevant parts of the string (using 'min_index' & 'max_index' input/config options)
    - remove matching tokens from the string, and replace them with a single whitespace (using the 'remove_tokens' option)

    By default, if an input string can't be parsed this module will raise an exception. This can be prevented by
    setting this module's 'force_non_null' config option or input to 'False', in which case un-parsable strings
    will appear as 'NULL' values in the resulting array.
    """

    _module_type_name = "parse.date_array"
    _config_cls = ExtractDateConfig

    def create_inputs_schema(
        self,
    ) -> ValueMapSchema:
        """Declare the single 'array' input."""

        inputs = {"array": {"type": "array", "doc": "The input array."}}
        return inputs

    def create_outputs_schema(
        self,
    ) -> ValueMapSchema:
        """Declare the single 'date_array' output."""

        return {
            "date_array": {
                "type": "array",
                "doc": "The resulting array with items of a date data type.",
            }
        }

    def process(self, inputs: ValueMap, outputs: ValueMap, job_log: JobLog):
        """Parse every item of the 'array' input and store the result as 'date_array'.

        The 'force_non_null', 'min_index', 'max_index' and 'remove_tokens'
        settings are read via ``get_data_for_field``. Each item is cut to
        ``[min_index:max_index]``, has every token in 'remove_tokens' replaced
        by a single space, and is then handed to ``dateutil.parser.parse``
        with ``fuzzy=True``.

        Raises:
            KiaraProcessingException: if an item can't be parsed and
                'force_non_null' is true. The parser error is attached as the
                exception's cause.
        """

        import polars as pl
        import pyarrow as pa
        from dateutil import parser

        force_non_null: bool = self.get_data_for_field(
            field_name="force_non_null", inputs=inputs
        )
        min_pos: Union[None, int] = self.get_data_for_field(
            field_name="min_index", inputs=inputs
        )
        max_pos: Union[None, int] = self.get_data_for_field(
            field_name="max_index", inputs=inputs
        )
        remove_tokens: Iterable[str] = self.get_data_for_field(
            field_name="remove_tokens", inputs=inputs
        )

        # A bound of None or 0 means "no bound" (0 was already ignored before).
        # Both bounds index into the *original* string and are applied as one
        # slice. The previous two-step cut used `max_pos - min_pos` as the end
        # of the already-cut text, which went wrong for a negative max_index
        # or for max_index < min_index.
        cut_window = slice(min_pos or None, max_pos or None)
        cut_requested = bool(min_pos or max_pos)

        def parse_date(_text: str):
            """Return the date parsed from one item, or None if it is unparsable."""

            text = _text
            if cut_requested:
                try:
                    text = text[cut_window]
                except Exception:
                    # Slicing a str never raises, so the item is presumably
                    # not a string. Kept as before: with a lower bound the
                    # item becomes null, otherwise it goes on to the parser.
                    # NOTE(review): the null is returned even when
                    # force_non_null is set -- confirm this is intended.
                    if min_pos:
                        return None

            if remove_tokens:
                for token in remove_tokens:
                    text = text.replace(token, " ")

            try:
                d_obj = parser.parse(text, fuzzy=True)
            except Exception as e:
                if force_non_null:
                    # Chain explicitly so the parser error stays the cause.
                    raise KiaraProcessingException(e) from e
                return None

            # Defensive: the parser is expected to raise rather than return None.
            if d_obj is None:
                if force_non_null:
                    raise KiaraProcessingException(
                        f"Can't parse date from string: {text}"
                    )
                return None

            return d_obj

        value = inputs.get_value_obj("array")
        array: KiaraArray = value.data

        series = pl.Series(name="tokens", values=array.arrow_array)
        job_log.add_log(f"start parsing date for {len(array)} items")
        result = series.apply(parse_date)
        job_log.add_log(f"finished parsing date for {len(array)} items")
        result_array = result.to_arrow()

        # TODO: remove this cast once the array data type can handle non-chunked arrays
        chunked = pa.chunked_array(result_array)
        outputs.set_values(date_array=chunked)

Attributes

_config_cls = ExtractDateConfig class-attribute instance-attribute

Functions

create_inputs_schema() -> ValueMapSchema
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/array/__init__.py
88
89
90
91
92
93
def create_inputs_schema(
    self,
) -> ValueMapSchema:
    """Declare the single 'array' input."""
    array_field = {"type": "array", "doc": "The input array."}
    return {"array": array_field}
create_outputs_schema() -> ValueMapSchema
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/array/__init__.py
 95
 96
 97
 98
 99
100
101
102
103
104
def create_outputs_schema(
    self,
) -> ValueMapSchema:
    """Declare the single 'date_array' output."""
    date_array_field = {
        "type": "array",
        "doc": "The resulting array with items of a date data type.",
    }
    return {"date_array": date_array_field}
process(inputs: ValueMap, outputs: ValueMap, job_log: JobLog)
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/array/__init__.py
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
def process(self, inputs: ValueMap, outputs: ValueMap, job_log: JobLog):
    """Parse every item of the 'array' input and store the result as 'date_array'.

    The 'force_non_null', 'min_index', 'max_index' and 'remove_tokens'
    settings are read via ``get_data_for_field``. Each item is cut to
    ``[min_index:max_index]``, has every token in 'remove_tokens' replaced
    by a single space, and is then handed to ``dateutil.parser.parse``
    with ``fuzzy=True``.

    Raises:
        KiaraProcessingException: if an item can't be parsed and
            'force_non_null' is true. The parser error is attached as the
            exception's cause.
    """

    import polars as pl
    import pyarrow as pa
    from dateutil import parser

    force_non_null: bool = self.get_data_for_field(
        field_name="force_non_null", inputs=inputs
    )
    min_pos: Union[None, int] = self.get_data_for_field(
        field_name="min_index", inputs=inputs
    )
    max_pos: Union[None, int] = self.get_data_for_field(
        field_name="max_index", inputs=inputs
    )
    remove_tokens: Iterable[str] = self.get_data_for_field(
        field_name="remove_tokens", inputs=inputs
    )

    # A bound of None or 0 means "no bound" (0 was already ignored before).
    # Both bounds index into the *original* string and are applied as one
    # slice. The previous two-step cut used `max_pos - min_pos` as the end
    # of the already-cut text, which went wrong for a negative max_index
    # or for max_index < min_index.
    cut_window = slice(min_pos or None, max_pos or None)
    cut_requested = bool(min_pos or max_pos)

    def parse_date(_text: str):
        """Return the date parsed from one item, or None if it is unparsable."""

        text = _text
        if cut_requested:
            try:
                text = text[cut_window]
            except Exception:
                # Slicing a str never raises, so the item is presumably
                # not a string. Kept as before: with a lower bound the
                # item becomes null, otherwise it goes on to the parser.
                # NOTE(review): the null is returned even when
                # force_non_null is set -- confirm this is intended.
                if min_pos:
                    return None

        if remove_tokens:
            for token in remove_tokens:
                text = text.replace(token, " ")

        try:
            d_obj = parser.parse(text, fuzzy=True)
        except Exception as e:
            if force_non_null:
                # Chain explicitly so the parser error stays the cause.
                raise KiaraProcessingException(e) from e
            return None

        # Defensive: the parser is expected to raise rather than return None.
        if d_obj is None:
            if force_non_null:
                raise KiaraProcessingException(
                    f"Can't parse date from string: {text}"
                )
            return None

        return d_obj

    value = inputs.get_value_obj("array")
    array: KiaraArray = value.data

    series = pl.Series(name="tokens", values=array.arrow_array)
    job_log.add_log(f"start parsing date for {len(array)} items")
    result = series.apply(parse_date)
    job_log.add_log(f"finished parsing date for {len(array)} items")
    result_array = result.to_arrow()

    # TODO: remove this cast once the array data type can handle non-chunked arrays
    chunked = pa.chunked_array(result_array)
    outputs.set_values(date_array=chunked)