Bases: AutoInputsKiaraModule
  
      Create an array of date objects from an array of strings.
This module is very simplistic at the moment, more functionality and options will be added in the future.
At its core, this module uses the standard parser from the
dateutil package to parse strings into dates. Because this parser can't handle
complex strings, the input strings can be pre-processed in the following ways:
- 'cut' non-relevant parts of the string (using 'min_index' & 'max_index' input/config options)
 
- remove matching tokens from the string, and replace them with a single whitespace (using the 'remove_tokens' option)
 
By default, if an input string can't be parsed this module will raise an exception. This can be prevented by
setting this module's 'force_non_null' config option or input to 'False', in which case un-parsable strings
will appear as 'NULL' values in the resulting array.
        
          Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/array/__init__.py
           68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
class ExtractDateModule(AutoInputsKiaraModule):
    """Create an array of date objects from an array of strings.

    This module is very simplistic at the moment; more functionality and
    options will be added in the future.

    At its core, this module uses the standard parser from the
    [dateutil](https://github.com/dateutil/dateutil) package to parse strings
    into dates. Because that parser can't handle complex strings, the input
    strings can be pre-processed in the following ways:

    - 'cut' non-relevant parts of the string (using the 'min_index' &
      'max_index' input/config options)
    - remove matching tokens from the string, and replace them with a single
      whitespace (using the 'remove_tokens' option)

    By default, if an input string can't be parsed this module will raise an
    exception. This can be prevented by setting this module's 'force_non_null'
    config option or input to 'False', in which case un-parsable strings will
    appear as 'NULL' values in the resulting array.
    """

    _module_type_name = "parse.date_array"
    _config_cls = ExtractDateConfig

    def create_inputs_schema(
        self,
    ) -> ValueMapSchema:
        # Single input: the array of strings to parse into dates.
        return {"array": {"type": "array", "doc": "The input array."}}

    def create_outputs_schema(
        self,
    ) -> ValueMapSchema:
        output_schema = {
            "date_array": {
                "type": "array",
                "doc": "The resulting array with items of a date data type.",
            }
        }
        return output_schema

    def process(self, inputs: ValueMap, outputs: ValueMap, job_log: JobLog):
        # Heavy deps are imported lazily so the module can be inspected
        # without polars/pyarrow/dateutil installed.
        import polars as pl
        import pyarrow as pa
        from dateutil import parser

        # Field values come from the auto-generated inputs/config
        # (AutoInputsKiaraModule merges config options into inputs).
        force_non_null: bool = self.get_data_for_field(
            field_name="force_non_null", inputs=inputs
        )
        min_pos: Union[None, int] = self.get_data_for_field(
            field_name="min_index", inputs=inputs
        )
        min_pos = 0 if min_pos is None else min_pos
        max_pos: Union[None, int] = self.get_data_for_field(
            field_name="max_index", inputs=inputs
        )
        remove_tokens: Iterable[str] = self.get_data_for_field(
            field_name="remove_tokens", inputs=inputs
        )

        def parse_date(raw: str):
            """Pre-process a single string, then parse it with dateutil."""
            text = raw
            if min_pos:
                # Cut off everything before 'min_index'.
                try:
                    text = text[min_pos:]  # type: ignore
                except Exception:
                    return None
            if max_pos:
                # Cut off everything after 'max_index' (relative to the
                # already-cut string, hence the '- min_pos').
                try:
                    text = text[0 : max_pos - min_pos]  # type: ignore
                except Exception:
                    pass
            # Replace each unwanted token with a single whitespace.
            for token in remove_tokens or ():
                text = text.replace(token, " ")
            try:
                d_obj = parser.parse(text, fuzzy=True)
            except Exception as e:
                if force_non_null:
                    raise KiaraProcessingException(e)
                return None
            if d_obj is None:
                if force_non_null:
                    raise KiaraProcessingException(
                        f"Can't parse date from string: {text}"
                    )
                return None
            return d_obj

        value = inputs.get_value_obj("array")
        array: KiaraArray = value.data
        tokens = pl.Series(name="tokens", values=array.arrow_array)
        job_log.add_log(f"start parsing date for {len(array)} items")
        parsed = tokens.apply(parse_date)
        job_log.add_log(f"finished parsing date for {len(array)} items")
        # TODO: remove this cast once the array data type can handle non-chunked arrays
        outputs.set_values(date_array=pa.chunked_array(parsed.to_arrow()))
  | 
 
         
  
  
  
      
        Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/array/__init__.py
         | def create_inputs_schema(
    self,
) -> ValueMapSchema:
    inputs = {"array": {"type": "array", "doc": "The input array."}}
    return inputs
  | 
 
       
   
 
  
      
        Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/array/__init__.py
         95
 96
 97
 98
 99
100
101
102
103
def create_outputs_schema(
    self,
) -> ValueMapSchema:
    """Declare the single 'date_array' output this module produces."""
    output_schema = {
        "date_array": {
            "type": "array",
            "doc": "The resulting array with items of a date data type.",
        }
    }
    return output_schema
  | 
 
       
   
 
  
      
        Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/array/__init__.py
        106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
def process(self, inputs: ValueMap, outputs: ValueMap, job_log: JobLog):
    """Parse every string in the 'array' input into a date object array."""
    # Heavy deps imported lazily, inside the processing step.
    import polars as pl
    import pyarrow as pa
    from dateutil import parser

    # Resolve config/input field values (merged by AutoInputsKiaraModule).
    force_non_null: bool = self.get_data_for_field(
        field_name="force_non_null", inputs=inputs
    )
    min_pos: Union[None, int] = self.get_data_for_field(
        field_name="min_index", inputs=inputs
    )
    min_pos = 0 if min_pos is None else min_pos
    max_pos: Union[None, int] = self.get_data_for_field(
        field_name="max_index", inputs=inputs
    )
    remove_tokens: Iterable[str] = self.get_data_for_field(
        field_name="remove_tokens", inputs=inputs
    )

    def parse_date(raw: str):
        """Pre-process one string per the config, then parse it with dateutil."""
        text = raw
        if min_pos:
            # Drop the prefix before 'min_index'.
            try:
                text = text[min_pos:]  # type: ignore
            except Exception:
                return None
        if max_pos:
            # Drop the suffix after 'max_index'; the string was already cut
            # at 'min_index', hence the '- min_pos' adjustment.
            try:
                text = text[0 : max_pos - min_pos]  # type: ignore
            except Exception:
                pass
        # Replace each unwanted token with a single whitespace.
        for token in remove_tokens or ():
            text = text.replace(token, " ")
        try:
            d_obj = parser.parse(text, fuzzy=True)
        except Exception as e:
            if force_non_null:
                raise KiaraProcessingException(e)
            return None
        if d_obj is None:
            if force_non_null:
                raise KiaraProcessingException(
                    f"Can't parse date from string: {text}"
                )
            return None
        return d_obj

    value = inputs.get_value_obj("array")
    array: KiaraArray = value.data
    tokens = pl.Series(name="tokens", values=array.arrow_array)
    job_log.add_log(f"start parsing date for {len(array)} items")
    parsed = tokens.apply(parse_date)
    job_log.add_log(f"finished parsing date for {len(array)} items")
    # TODO: remove this cast once the array data type can handle non-chunked arrays
    outputs.set_values(date_array=pa.chunked_array(parsed.to_arrow()))
  |