Skip to content

table

Attributes

EMPTY_COLUMN_NAME_MARKER = '__no_column_name__' module-attribute

Classes

CreateTableModuleConfig

Bases: CreateFromModuleConfig

Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/table/__init__.py
38
39
40
41
42
43
class CreateTableModuleConfig(CreateFromModuleConfig):

    ignore_errors: bool = Field(
        description="Whether to ignore convert errors and omit the failed items.",
        default=False,
    )

Attributes

ignore_errors: bool = Field(description='Whether to ignore convert errors and omit the failed items.', default=False) class-attribute instance-attribute

CreateTableModule

Bases: CreateFromModule

Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/table/__init__.py
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
class CreateTableModule(CreateFromModule):

    _module_type_name = "create.table"
    _config_cls = CreateTableModuleConfig

    def create_optional_inputs(
        self, source_type: str, target_type
    ) -> Union[Mapping[str, Mapping[str, Any]], None]:

        if source_type == "file":
            return {
                "first_row_is_header": {
                    "type": "boolean",
                    "optional": True,
                    "doc": "Whether the first row of the file is a header row. If not provided, kiara will try to auto-determine.",
                }
            }

        return None

    def create__table__from__file(self, source_value: Value, optional: ValueMap) -> Any:
        """Create a table from a file, trying to auto-determine the format of said file."""

        import csv as py_csv

        from pyarrow import csv

        input_file: KiaraFile = source_value.data
        imported_data = None
        errors = []

        has_header = optional.get_value_data("first_row_is_header")
        if has_header is None:
            try:
                has_header = True
                with open(input_file.path, "rt") as csvfile:
                    sniffer = py_csv.Sniffer()
                    has_header = sniffer.has_header(csvfile.read(2048))
                    csvfile.seek(0)
            except Exception as e:
                # TODO: add this to the procss log
                log_message(
                    "csv_sniffer.error",
                    file=input_file.path,
                    error=str(e),
                    details="assuming csv file has header",
                )

        try:
            if has_header:
                imported_data = csv.read_csv(input_file.path)
            else:
                read_options = csv.ReadOptions(autogenerate_column_names=True)
                imported_data = csv.read_csv(input_file.path, read_options=read_options)
        except Exception as e:
            errors.append(e)

        if imported_data is None:
            raise KiaraProcessingException(
                f"Failed to import file '{input_file.path}'."
            )

        # import pandas as pd
        # df = pd.read_csv(input_file.path)
        # imported_data = pa.Table.from_pandas(df)

        return KiaraTable.create_table(imported_data)

    # def create__table__from__csv_file(self, source_value: Value) -> Any:
    #     """Create a table from a csv_file value."""
    #
    #     from pyarrow import csv
    #
    #     input_file: FileModel = source_value.data
    #     imported_data = csv.read_csv(input_file.path)
    #
    #     # import pandas as pd
    #     # df = pd.read_csv(input_file.path)
    #     # imported_data = pa.Table.from_pandas(df)
    #
    #     return KiaraTable.create_table(imported_data)

    def create__table__from__file_bundle(self, source_value: Value) -> Any:
        """Create a table value from a text file_bundle.

        The resulting table will have (at a minimum) the following columns:
        - id: an auto-assigned index
        - rel_path: the relative path of the file (from the provided base path)
        - content: the text file content
        """

        import pyarrow as pa

        bundle: KiaraFileBundle = source_value.data

        columns = FILE_BUNDLE_IMPORT_AVAILABLE_COLUMNS

        ignore_errors = self.get_config_value("ignore_errors")
        file_dict = bundle.read_text_file_contents(ignore_errors=ignore_errors)

        # TODO: use chunks to save on memory
        tabular: Dict[str, List[Any]] = {}
        for column in columns:
            for index, rel_path in enumerate(sorted(file_dict.keys())):

                if column == "content":
                    _value: Any = file_dict[rel_path]
                elif column == "id":
                    _value = index
                elif column == "rel_path":
                    _value = rel_path
                else:
                    file_model = bundle.included_files[rel_path]
                    _value = getattr(file_model, column)

                tabular.setdefault(column, []).append(_value)

        table = pa.Table.from_pydict(tabular)
        return KiaraTable.create_table(table)

Attributes

_config_cls = CreateTableModuleConfig class-attribute instance-attribute

Functions

create_optional_inputs(source_type: str, target_type: str) -> Union[Mapping[str, Mapping[str, Any]], None]
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/table/__init__.py
51
52
53
54
55
56
57
58
59
60
61
62
63
64
def create_optional_inputs(
    self, source_type: str, target_type
) -> Union[Mapping[str, Mapping[str, Any]], None]:

    if source_type == "file":
        return {
            "first_row_is_header": {
                "type": "boolean",
                "optional": True,
                "doc": "Whether the first row of the file is a header row. If not provided, kiara will try to auto-determine.",
            }
        }

    return None
create__table__from__file(source_value: Value, optional: ValueMap) -> Any

Create a table from a file, trying to auto-determine the format of said file.

Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/table/__init__.py
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
def create__table__from__file(self, source_value: Value, optional: ValueMap) -> Any:
    """Create a table from a file, trying to auto-determine the format of said file."""

    import csv as py_csv

    from pyarrow import csv

    input_file: KiaraFile = source_value.data
    imported_data = None
    errors = []

    has_header = optional.get_value_data("first_row_is_header")
    if has_header is None:
        try:
            has_header = True
            with open(input_file.path, "rt") as csvfile:
                sniffer = py_csv.Sniffer()
                has_header = sniffer.has_header(csvfile.read(2048))
                csvfile.seek(0)
        except Exception as e:
            # TODO: add this to the procss log
            log_message(
                "csv_sniffer.error",
                file=input_file.path,
                error=str(e),
                details="assuming csv file has header",
            )

    try:
        if has_header:
            imported_data = csv.read_csv(input_file.path)
        else:
            read_options = csv.ReadOptions(autogenerate_column_names=True)
            imported_data = csv.read_csv(input_file.path, read_options=read_options)
    except Exception as e:
        errors.append(e)

    if imported_data is None:
        raise KiaraProcessingException(
            f"Failed to import file '{input_file.path}'."
        )

    # import pandas as pd
    # df = pd.read_csv(input_file.path)
    # imported_data = pa.Table.from_pandas(df)

    return KiaraTable.create_table(imported_data)
create__table__from__file_bundle(source_value: Value) -> Any

Create a table value from a text file_bundle.

The resulting table will have (at a minimum) the following columns: - id: an auto-assigned index - rel_path: the relative path of the file (from the provided base path) - content: the text file content

Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/table/__init__.py
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
def create__table__from__file_bundle(self, source_value: Value) -> Any:
    """Create a table value from a text file_bundle.

    The resulting table will have (at a minimum) the following columns:
    - id: an auto-assigned index
    - rel_path: the relative path of the file (from the provided base path)
    - content: the text file content
    """

    import pyarrow as pa

    bundle: KiaraFileBundle = source_value.data

    columns = FILE_BUNDLE_IMPORT_AVAILABLE_COLUMNS

    ignore_errors = self.get_config_value("ignore_errors")
    file_dict = bundle.read_text_file_contents(ignore_errors=ignore_errors)

    # TODO: use chunks to save on memory
    tabular: Dict[str, List[Any]] = {}
    for column in columns:
        for index, rel_path in enumerate(sorted(file_dict.keys())):

            if column == "content":
                _value: Any = file_dict[rel_path]
            elif column == "id":
                _value = index
            elif column == "rel_path":
                _value = rel_path
            else:
                file_model = bundle.included_files[rel_path]
                _value = getattr(file_model, column)

            tabular.setdefault(column, []).append(_value)

    table = pa.Table.from_pydict(tabular)
    return KiaraTable.create_table(table)

DeserializeTableModule

Bases: DeserializeValueModule

Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/table/__init__.py
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
class DeserializeTableModule(DeserializeValueModule):

    _module_type_name = "load.table"

    @classmethod
    def retrieve_supported_target_profiles(cls) -> Mapping[str, Type]:
        return {"python_object": KiaraTable}

    @classmethod
    def retrieve_serialized_value_type(cls) -> str:
        return "table"

    @classmethod
    def retrieve_supported_serialization_profile(cls) -> str:
        return "feather"

    def to__python_object(self, data: SerializedData, **config: Any):

        import pyarrow as pa

        columns = {}

        table_schema_chunks = data.get_serialized_data(TABLE_SCHEMA_CHUNKS_NAME)
        chunks_generator = table_schema_chunks.get_chunks(as_files=False)
        schema_chunk = next(chunks_generator)  # type: ignore
        schema = pa.ipc.read_schema(pa.py_buffer(schema_chunk))

        for column_name in data.get_keys():

            if column_name == TABLE_SCHEMA_CHUNKS_NAME:
                continue

            chunks = data.get_serialized_data(column_name)

            # TODO: support multiple chunks
            assert chunks.get_number_of_chunks() == 1
            files = list(chunks.get_chunks(as_files=True, symlink_ok=True))
            assert len(files) == 1

            file = files[0]
            with pa.memory_map(file, "r") as column_chunk:
                loaded_arrays: pa.Table = pa.ipc.open_file(column_chunk).read_all()
                column = loaded_arrays.column(column_name)
                if column_name == EMPTY_COLUMN_NAME_MARKER:
                    columns[""] = column
                else:
                    columns[column_name] = column

        arrow_table = pa.table(columns, schema=schema)

        table = KiaraTable.create_table(arrow_table)
        return table

Functions

retrieve_supported_target_profiles() -> Mapping[str, Type] classmethod
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/table/__init__.py
171
172
173
@classmethod
def retrieve_supported_target_profiles(cls) -> Mapping[str, Type]:
    return {"python_object": KiaraTable}
retrieve_serialized_value_type() -> str classmethod
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/table/__init__.py
175
176
177
@classmethod
def retrieve_serialized_value_type(cls) -> str:
    return "table"
retrieve_supported_serialization_profile() -> str classmethod
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/table/__init__.py
179
180
181
@classmethod
def retrieve_supported_serialization_profile(cls) -> str:
    return "feather"
to__python_object(data: SerializedData, **config: Any)
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/table/__init__.py
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
def to__python_object(self, data: SerializedData, **config: Any):

    import pyarrow as pa

    columns = {}

    table_schema_chunks = data.get_serialized_data(TABLE_SCHEMA_CHUNKS_NAME)
    chunks_generator = table_schema_chunks.get_chunks(as_files=False)
    schema_chunk = next(chunks_generator)  # type: ignore
    schema = pa.ipc.read_schema(pa.py_buffer(schema_chunk))

    for column_name in data.get_keys():

        if column_name == TABLE_SCHEMA_CHUNKS_NAME:
            continue

        chunks = data.get_serialized_data(column_name)

        # TODO: support multiple chunks
        assert chunks.get_number_of_chunks() == 1
        files = list(chunks.get_chunks(as_files=True, symlink_ok=True))
        assert len(files) == 1

        file = files[0]
        with pa.memory_map(file, "r") as column_chunk:
            loaded_arrays: pa.Table = pa.ipc.open_file(column_chunk).read_all()
            column = loaded_arrays.column(column_name)
            if column_name == EMPTY_COLUMN_NAME_MARKER:
                columns[""] = column
            else:
                columns[column_name] = column

    arrow_table = pa.table(columns, schema=schema)

    table = KiaraTable.create_table(arrow_table)
    return table

PickColumnModuleConfig

Bases: KiaraModuleConfig

Configuration for the 'table.cut_column' kiara module.

Technically this is not necessary, because we could just use the 'constants' field to set the 'column_name'. But this module is used in the documentation as example, as it's easy enough to understand, and I wanted to show how implement kiara module configuration.

Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/table/__init__.py
221
222
223
224
225
226
227
228
229
230
231
class PickColumnModuleConfig(KiaraModuleConfig):
    """Configuration for the 'table.cut_column' kiara module.

    Technically this is not necessary, because we could just use the 'constants' field to
    set the 'column_name'. But this module is used in the documentation as example, as it's easy enough to understand,
    and I wanted to show how implement kiara module configuration.
    """

    column_name: Union[str, None] = Field(
        description="A hardcoded column name to cut.", default=None
    )

Attributes

column_name: Union[str, None] = Field(description='A hardcoded column name to cut.', default=None) class-attribute instance-attribute

PickColumnModule

Bases: KiaraModule

Pick one column from a table, returning an array.

Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/table/__init__.py
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
class PickColumnModule(KiaraModule):
    """Pick one column from a table, returning an array."""

    _module_type_name = "table.pick.column"
    _config_cls = PickColumnModuleConfig

    def create_inputs_schema(
        self,
    ) -> ValueMapSchema:

        inputs: Dict[str, Any] = {"table": {"type": "table", "doc": "A table."}}
        column_name = self.get_config_value("column_name")
        if not column_name:
            inputs["column_name"] = {
                "type": "string",
                "doc": "The name of the column to extract.",
            }

        return inputs

    def create_outputs_schema(
        self,
    ) -> ValueMapSchema:

        outputs: Mapping[str, Any] = {"array": {"type": "array", "doc": "The column."}}
        return outputs

    def process(self, inputs: ValueMap, outputs: ValueMap) -> None:

        import pyarrow as pa

        column_name: Union[str, None] = self.get_config_value("column_name")
        if not column_name:
            column_name = inputs.get_value_data("column_name")

        if not column_name:
            raise KiaraProcessingException(
                "Could not cut column from table: column_name not provided or empty string."
            )

        table_value: Value = inputs.get_value_obj("table")
        table_metadata: KiaraTableMetadata = table_value.get_property_data(
            "metadata.table"
        )
        available = table_metadata.table.column_names

        if column_name not in available:
            raise KiaraProcessingException(
                f"Invalid column name '{column_name}'. Available column names: {', '.join(available)}"
            )

        table: pa.Table = table_value.data.arrow_table
        column = table.column(column_name)

        outputs.set_value("array", column)

Attributes

_config_cls = PickColumnModuleConfig class-attribute instance-attribute

Functions

create_inputs_schema() -> ValueMapSchema
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/table/__init__.py
240
241
242
243
244
245
246
247
248
249
250
251
252
def create_inputs_schema(
    self,
) -> ValueMapSchema:

    inputs: Dict[str, Any] = {"table": {"type": "table", "doc": "A table."}}
    column_name = self.get_config_value("column_name")
    if not column_name:
        inputs["column_name"] = {
            "type": "string",
            "doc": "The name of the column to extract.",
        }

    return inputs
create_outputs_schema() -> ValueMapSchema
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/table/__init__.py
254
255
256
257
258
259
def create_outputs_schema(
    self,
) -> ValueMapSchema:

    outputs: Mapping[str, Any] = {"array": {"type": "array", "doc": "The column."}}
    return outputs
process(inputs: ValueMap, outputs: ValueMap) -> None
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/table/__init__.py
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
def process(self, inputs: ValueMap, outputs: ValueMap) -> None:

    import pyarrow as pa

    column_name: Union[str, None] = self.get_config_value("column_name")
    if not column_name:
        column_name = inputs.get_value_data("column_name")

    if not column_name:
        raise KiaraProcessingException(
            "Could not cut column from table: column_name not provided or empty string."
        )

    table_value: Value = inputs.get_value_obj("table")
    table_metadata: KiaraTableMetadata = table_value.get_property_data(
        "metadata.table"
    )
    available = table_metadata.table.column_names

    if column_name not in available:
        raise KiaraProcessingException(
            f"Invalid column name '{column_name}'. Available column names: {', '.join(available)}"
        )

    table: pa.Table = table_value.data.arrow_table
    column = table.column(column_name)

    outputs.set_value("array", column)

MergeTableConfig

Bases: KiaraModuleConfig

Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/table/__init__.py
291
292
293
294
295
296
297
298
class MergeTableConfig(KiaraModuleConfig):

    inputs_schema: Dict[str, ValueSchema] = Field(
        description="A dict describing the inputs for this merge process."
    )
    column_map: Dict[str, str] = Field(
        description="A map describing", default_factory=dict
    )

Attributes

inputs_schema: Dict[str, ValueSchema] = Field(description='A dict describing the inputs for this merge process.') class-attribute instance-attribute
column_map: Dict[str, str] = Field(description='A map describing', default_factory=dict) class-attribute instance-attribute

MergeTableModule

Bases: KiaraModule

Create a table from other tables and/or arrays.

This module needs configuration to be set (for now). It's currently not possible to merge an arbitrary number of tables/arrays, all tables to be merged must be specified in the module configuration.

Column names of the resulting table can be controlled by the 'column_map' configuration, which takes the desired column name as key, and a field-name in the following format as value: - '[inputs_schema key]' for inputs of type 'array' - '[inputs_schema_key].orig_column_name' for inputs of type 'table'

Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/table/__init__.py
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
class MergeTableModule(KiaraModule):
    """Create a table from other tables and/or arrays.

    This module needs configuration to be set (for now). It's currently not possible to merge an arbitrary
    number of tables/arrays, all tables to be merged must be specified in the module configuration.

    Column names of the resulting table can be controlled by the 'column_map' configuration, which takes the
    desired column name as key, and a field-name in the following format as value:
    - '[inputs_schema key]' for inputs of type 'array'
    - '[inputs_schema_key].orig_column_name' for inputs of type 'table'
    """

    _module_type_name = "table.merge"
    _config_cls = MergeTableConfig

    def create_inputs_schema(
        self,
    ) -> ValueMapSchema:

        input_schema_dict = self.get_config_value("inputs_schema")
        return input_schema_dict

    def create_outputs_schema(
        self,
    ) -> ValueMapSchema:

        outputs = {
            "table": {
                "type": "table",
                "doc": "The merged table, including all source tables and columns.",
            }
        }
        return outputs

    def process(self, inputs: ValueMap, outputs: ValueMap, job_log: JobLog) -> None:

        import pyarrow as pa

        inputs_schema: Dict[str, Any] = self.get_config_value("inputs_schema")
        column_map: Dict[str, str] = self.get_config_value("column_map")

        sources = {}
        for field_name in inputs_schema.keys():
            sources[field_name] = inputs.get_value_data(field_name)

        len_dict = {}
        arrays = {}

        column_map_final = dict(column_map)

        for source_key, table_or_array in sources.items():

            if isinstance(table_or_array, KiaraTable):
                rows = table_or_array.num_rows
                for name in table_or_array.column_names:
                    array_name = f"{source_key}.{name}"
                    if column_map and array_name not in column_map.values():
                        job_log.add_log(
                            f"Ignoring column '{name}' of input table '{source_key}': not listed in column_map."
                        )
                        continue

                    column = table_or_array.arrow_table.column(name)
                    arrays[array_name] = column
                    if not column_map:
                        if name in column_map_final:
                            raise Exception(
                                f"Can't merge table, duplicate column name: {name}."
                            )
                        column_map_final[name] = array_name

            elif isinstance(table_or_array, KiaraArray):

                if column_map and source_key not in column_map.values():
                    job_log.add_log(
                        f"Ignoring array '{source_key}': not listed in column_map."
                    )
                    continue

                rows = len(table_or_array)
                arrays[source_key] = table_or_array.arrow_array

                if not column_map:
                    if source_key in column_map_final.keys():
                        raise Exception(
                            f"Can't merge table, duplicate column name: {source_key}."
                        )
                    column_map_final[source_key] = source_key

            else:
                raise KiaraProcessingException(
                    f"Can't merge table: invalid type '{type(table_or_array)}' for source '{source_key}'."
                )

            len_dict[source_key] = rows

        all_rows = None
        for source_key, rows in len_dict.items():
            if all_rows is None:
                all_rows = rows
            elif all_rows != rows:
                all_rows = None
                break

        if all_rows is None:
            len_str = ""
            for name, rows in len_dict.items():
                len_str = f" {name} ({rows})"

            raise KiaraProcessingException(
                f"Can't merge table, sources have different lengths: {len_str}"
            )

        column_names = []
        columns = []
        for column_name, ref in column_map_final.items():
            column_names.append(column_name)
            column = arrays[ref]
            columns.append(column)

        table = pa.Table.from_arrays(arrays=columns, names=column_names)

        outputs.set_value("table", table)

Attributes

_config_cls = MergeTableConfig class-attribute instance-attribute

Functions

create_inputs_schema() -> ValueMapSchema
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/table/__init__.py
316
317
318
319
320
321
def create_inputs_schema(
    self,
) -> ValueMapSchema:

    input_schema_dict = self.get_config_value("inputs_schema")
    return input_schema_dict
create_outputs_schema() -> ValueMapSchema
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/table/__init__.py
323
324
325
326
327
328
329
330
331
332
333
def create_outputs_schema(
    self,
) -> ValueMapSchema:

    outputs = {
        "table": {
            "type": "table",
            "doc": "The merged table, including all source tables and columns.",
        }
    }
    return outputs
process(inputs: ValueMap, outputs: ValueMap, job_log: JobLog) -> None
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/table/__init__.py
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
def process(self, inputs: ValueMap, outputs: ValueMap, job_log: JobLog) -> None:

    import pyarrow as pa

    inputs_schema: Dict[str, Any] = self.get_config_value("inputs_schema")
    column_map: Dict[str, str] = self.get_config_value("column_map")

    sources = {}
    for field_name in inputs_schema.keys():
        sources[field_name] = inputs.get_value_data(field_name)

    len_dict = {}
    arrays = {}

    column_map_final = dict(column_map)

    for source_key, table_or_array in sources.items():

        if isinstance(table_or_array, KiaraTable):
            rows = table_or_array.num_rows
            for name in table_or_array.column_names:
                array_name = f"{source_key}.{name}"
                if column_map and array_name not in column_map.values():
                    job_log.add_log(
                        f"Ignoring column '{name}' of input table '{source_key}': not listed in column_map."
                    )
                    continue

                column = table_or_array.arrow_table.column(name)
                arrays[array_name] = column
                if not column_map:
                    if name in column_map_final:
                        raise Exception(
                            f"Can't merge table, duplicate column name: {name}."
                        )
                    column_map_final[name] = array_name

        elif isinstance(table_or_array, KiaraArray):

            if column_map and source_key not in column_map.values():
                job_log.add_log(
                    f"Ignoring array '{source_key}': not listed in column_map."
                )
                continue

            rows = len(table_or_array)
            arrays[source_key] = table_or_array.arrow_array

            if not column_map:
                if source_key in column_map_final.keys():
                    raise Exception(
                        f"Can't merge table, duplicate column name: {source_key}."
                    )
                column_map_final[source_key] = source_key

        else:
            raise KiaraProcessingException(
                f"Can't merge table: invalid type '{type(table_or_array)}' for source '{source_key}'."
            )

        len_dict[source_key] = rows

    all_rows = None
    for source_key, rows in len_dict.items():
        if all_rows is None:
            all_rows = rows
        elif all_rows != rows:
            all_rows = None
            break

    if all_rows is None:
        len_str = ""
        for name, rows in len_dict.items():
            len_str = f" {name} ({rows})"

        raise KiaraProcessingException(
            f"Can't merge table, sources have different lengths: {len_str}"
        )

    column_names = []
    columns = []
    for column_name, ref in column_map_final.items():
        column_names.append(column_name)
        column = arrays[ref]
        columns.append(column)

    table = pa.Table.from_arrays(arrays=columns, names=column_names)

    outputs.set_value("table", table)

QueryTableSQLModuleConfig

Bases: KiaraModuleConfig

Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/table/__init__.py
426
427
428
429
430
431
432
433
434
435
class QueryTableSQLModuleConfig(KiaraModuleConfig):

    query: Union[str, None] = Field(
        description="The query to execute. If not specified, the user will be able to provide their own.",
        default=None,
    )
    relation_name: Union[str, None] = Field(
        description="The name the table is referred to in the sql query. If not specified, the user will be able to provide their own.",
        default="data",
    )

Attributes

query: Union[str, None] = Field(description='The query to execute. If not specified, the user will be able to provide their own.', default=None) class-attribute instance-attribute
relation_name: Union[str, None] = Field(description='The name the table is referred to in the sql query. If not specified, the user will be able to provide their own.', default='data') class-attribute instance-attribute

QueryTableSQL

Bases: KiaraModule

Execute a sql query against an (Arrow) table.

The default relation name for the sql query is 'data', but can be modified by the 'relation_name' config option/input.

If the 'query' module config option is not set, users can provide their own query, otherwise the pre-set one will be used.

Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/table/__init__.py
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
class QueryTableSQL(KiaraModule):
    """Execute a sql query against an (Arrow) table.

    The default relation name for the sql query is 'data', but can be modified by the 'relation_name' config option/input.

    If the 'query' module config option is not set, users can provide their own query, otherwise the pre-set
    one will be used.
    """

    _module_type_name = "query.table"
    _config_cls = QueryTableSQLModuleConfig

    def create_inputs_schema(
        self,
    ) -> ValueMapSchema:

        inputs = {
            "table": {
                "type": "table",
                "doc": "The table to query",
            }
        }

        if self.get_config_value("query") is None:
            inputs["query"] = {
                "type": "string",
                "doc": "The query, use the value of the 'relation_name' input as table, e.g. 'select * from data'.",
            }
            inputs["relation_name"] = {
                "type": "string",
                "doc": "The name the table is referred to in the sql query.",
                "default": "data",
            }

        return inputs

    def create_outputs_schema(
        self,
    ) -> ValueMapSchema:

        return {"query_result": {"type": "table", "doc": "The query result."}}

    def process(self, inputs: ValueMap, outputs: ValueMap) -> None:

        import duckdb

        if self.get_config_value("query") is None:
            _query: str = inputs.get_value_data("query")
            _relation_name: str = inputs.get_value_data("relation_name")
        else:
            _query = self.get_config_value("query")
            _relation_name = self.get_config_value("relation_name")

        if _relation_name.upper() in RESERVED_SQL_KEYWORDS:
            raise KiaraProcessingException(
                f"Invalid relation name '{_relation_name}': this is a reserved sql keyword, please select a different name."
            )

        _table: KiaraTable = inputs.get_value_data("table")
        rel_from_arrow = duckdb.arrow(_table.arrow_table)
        result: duckdb.DuckDBPyRelation = rel_from_arrow.query(_relation_name, _query)

        outputs.set_value("query_result", result.arrow())

Attributes

_config_cls = QueryTableSQLModuleConfig class-attribute instance-attribute

Functions

create_inputs_schema() -> ValueMapSchema
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/table/__init__.py
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
def create_inputs_schema(
    self,
) -> ValueMapSchema:

    inputs = {
        "table": {
            "type": "table",
            "doc": "The table to query",
        }
    }

    if self.get_config_value("query") is None:
        inputs["query"] = {
            "type": "string",
            "doc": "The query, use the value of the 'relation_name' input as table, e.g. 'select * from data'.",
        }
        inputs["relation_name"] = {
            "type": "string",
            "doc": "The name the table is referred to in the sql query.",
            "default": "data",
        }

    return inputs
create_outputs_schema() -> ValueMapSchema
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/table/__init__.py
474
475
476
477
478
def create_outputs_schema(
    self,
) -> ValueMapSchema:

    return {"query_result": {"type": "table", "doc": "The query result."}}
process(inputs: ValueMap, outputs: ValueMap) -> None
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/table/__init__.py
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
def process(self, inputs: ValueMap, outputs: ValueMap) -> None:

    import duckdb

    if self.get_config_value("query") is None:
        _query: str = inputs.get_value_data("query")
        _relation_name: str = inputs.get_value_data("relation_name")
    else:
        _query = self.get_config_value("query")
        _relation_name = self.get_config_value("relation_name")

    if _relation_name.upper() in RESERVED_SQL_KEYWORDS:
        raise KiaraProcessingException(
            f"Invalid relation name '{_relation_name}': this is a reserved sql keyword, please select a different name."
        )

    _table: KiaraTable = inputs.get_value_data("table")
    rel_from_arrow = duckdb.arrow(_table.arrow_table)
    result: duckdb.DuckDBPyRelation = rel_from_arrow.query(_relation_name, _query)

    outputs.set_value("query_result", result.arrow())

ExportTableModule

Bases: DataExportModule

Export table data items.

Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/table/__init__.py
503
504
505
506
507
508
509
510
511
512
513
514
515
516
class ExportTableModule(DataExportModule):
    """Export table data items."""

    _module_type_name = "export.table"

    def export__table__as__csv_file(self, value: KiaraTable, base_path: str, name: str):
        """Export a table as csv file."""

        from pyarrow import csv

        target_path = os.path.join(base_path, f"{name}.csv")
        csv.write_csv(value.arrow_table, target_path)

        return {"files": target_path}

Functions

export__table__as__csv_file(value: KiaraTable, base_path: str, name: str)

Export a table as csv file.

Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/table/__init__.py
508
509
510
511
512
513
514
515
516
def export__table__as__csv_file(self, value: KiaraTable, base_path: str, name: str):
    """Export a table as csv file."""

    from pyarrow import csv

    target_path = os.path.join(base_path, f"{name}.csv")
    csv.write_csv(value.arrow_table, target_path)

    return {"files": target_path}

RenderTableModuleBase

Bases: RenderValueModule

Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/table/__init__.py
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
class RenderTableModuleBase(RenderValueModule):

    _module_type_name: str = None  # type: ignore

    def preprocess_table(
        self, value: Value, input_number_of_rows: int, input_row_offset: int
    ):

        import duckdb
        import pyarrow as pa

        if value.data_type_name == "array":
            array: KiaraArray = value.data
            arrow_table = pa.table(data=[array.arrow_array], names=["array"])
            column_names: Iterable[str] = ["array"]
        else:
            table: KiaraTable = value.data
            arrow_table = table.arrow_table
            column_names = table.column_names

        columnns = [f'"{x}"' if not x.startswith('"') else x for x in column_names]

        query = f"""SELECT {', '.join(columnns)} FROM data LIMIT {input_number_of_rows} OFFSET {input_row_offset}"""

        rel_from_arrow = duckdb.arrow(arrow_table)
        query_result: duckdb.DuckDBPyRelation = rel_from_arrow.query("data", query)

        result_table = query_result.arrow()
        wrap = ArrowTabularWrap(table=result_table)

        related_scenes: Dict[str, Union[None, RenderScene]] = {}

        row_offset = arrow_table.num_rows - input_number_of_rows

        if row_offset > 0:

            if input_row_offset > 0:
                related_scenes["first"] = RenderScene.construct(
                    title="first",
                    description=f"Display the first {input_number_of_rows} rows of this table.",
                    manifest_hash=self.manifest.manifest_hash,
                    render_config={
                        "row_offset": 0,
                        "number_of_rows": input_number_of_rows,
                    },
                )

                p_offset = input_row_offset - input_number_of_rows
                if p_offset < 0:
                    p_offset = 0
                previous = {
                    "row_offset": p_offset,
                    "number_of_rows": input_number_of_rows,
                }
                related_scenes["previous"] = RenderScene.construct(title="previous", description=f"Display the previous {input_number_of_rows} rows of this table.", manifest_hash=self.manifest.manifest_hash, render_config=previous)  # type: ignore
            else:
                related_scenes["first"] = None
                related_scenes["previous"] = None

            n_offset = input_row_offset + input_number_of_rows
            if n_offset < arrow_table.num_rows:
                next = {"row_offset": n_offset, "number_of_rows": input_number_of_rows}
                related_scenes["next"] = RenderScene.construct(title="next", description=f"Display the next {input_number_of_rows} rows of this table.", manifest_hash=self.manifest.manifest_hash, render_config=next)  # type: ignore
            else:
                related_scenes["next"] = None

            last_page = int(arrow_table.num_rows / input_number_of_rows)
            current_start = last_page * input_number_of_rows
            if (input_row_offset + input_number_of_rows) > arrow_table.num_rows:
                related_scenes["last"] = None
            else:
                related_scenes["last"] = RenderScene.construct(
                    title="last",
                    description="Display the final rows of this table.",
                    manifest_hash=self.manifest.manifest_hash,
                    render_config={
                        "row_offset": current_start,  # type: ignore
                        "number_of_rows": input_number_of_rows,  # type: ignore
                    },
                )
        else:
            related_scenes["first"] = None
            related_scenes["previous"] = None
            related_scenes["next"] = None
            related_scenes["last"] = None

        return wrap, related_scenes

Functions

preprocess_table(value: Value, input_number_of_rows: int, input_row_offset: int)
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/table/__init__.py
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
def preprocess_table(
    self, value: Value, input_number_of_rows: int, input_row_offset: int
):

    import duckdb
    import pyarrow as pa

    if value.data_type_name == "array":
        array: KiaraArray = value.data
        arrow_table = pa.table(data=[array.arrow_array], names=["array"])
        column_names: Iterable[str] = ["array"]
    else:
        table: KiaraTable = value.data
        arrow_table = table.arrow_table
        column_names = table.column_names

    columnns = [f'"{x}"' if not x.startswith('"') else x for x in column_names]

    query = f"""SELECT {', '.join(columnns)} FROM data LIMIT {input_number_of_rows} OFFSET {input_row_offset}"""

    rel_from_arrow = duckdb.arrow(arrow_table)
    query_result: duckdb.DuckDBPyRelation = rel_from_arrow.query("data", query)

    result_table = query_result.arrow()
    wrap = ArrowTabularWrap(table=result_table)

    related_scenes: Dict[str, Union[None, RenderScene]] = {}

    row_offset = arrow_table.num_rows - input_number_of_rows

    if row_offset > 0:

        if input_row_offset > 0:
            related_scenes["first"] = RenderScene.construct(
                title="first",
                description=f"Display the first {input_number_of_rows} rows of this table.",
                manifest_hash=self.manifest.manifest_hash,
                render_config={
                    "row_offset": 0,
                    "number_of_rows": input_number_of_rows,
                },
            )

            p_offset = input_row_offset - input_number_of_rows
            if p_offset < 0:
                p_offset = 0
            previous = {
                "row_offset": p_offset,
                "number_of_rows": input_number_of_rows,
            }
            related_scenes["previous"] = RenderScene.construct(title="previous", description=f"Display the previous {input_number_of_rows} rows of this table.", manifest_hash=self.manifest.manifest_hash, render_config=previous)  # type: ignore
        else:
            related_scenes["first"] = None
            related_scenes["previous"] = None

        n_offset = input_row_offset + input_number_of_rows
        if n_offset < arrow_table.num_rows:
            next = {"row_offset": n_offset, "number_of_rows": input_number_of_rows}
            related_scenes["next"] = RenderScene.construct(title="next", description=f"Display the next {input_number_of_rows} rows of this table.", manifest_hash=self.manifest.manifest_hash, render_config=next)  # type: ignore
        else:
            related_scenes["next"] = None

        last_page = int(arrow_table.num_rows / input_number_of_rows)
        current_start = last_page * input_number_of_rows
        if (input_row_offset + input_number_of_rows) > arrow_table.num_rows:
            related_scenes["last"] = None
        else:
            related_scenes["last"] = RenderScene.construct(
                title="last",
                description="Display the final rows of this table.",
                manifest_hash=self.manifest.manifest_hash,
                render_config={
                    "row_offset": current_start,  # type: ignore
                    "number_of_rows": input_number_of_rows,  # type: ignore
                },
            )
    else:
        related_scenes["first"] = None
        related_scenes["previous"] = None
        related_scenes["next"] = None
        related_scenes["last"] = None

    return wrap, related_scenes

RenderTableModule

Bases: RenderTableModuleBase

Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/table/__init__.py
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
class RenderTableModule(RenderTableModuleBase):
    _module_type_name = "render.table"

    def render__table__as__string(self, value: Value, render_config: Mapping[str, Any]):

        input_number_of_rows = render_config.get("number_of_rows", 20)
        input_row_offset = render_config.get("row_offset", 0)

        wrap, data_related_scenes = self.preprocess_table(
            value=value,
            input_number_of_rows=input_number_of_rows,
            input_row_offset=input_row_offset,
        )
        pretty = wrap.as_string(max_row_height=1)

        return RenderValueResult(
            value_id=value.value_id,
            render_config=render_config,
            render_manifest=self.manifest.manifest_hash,
            rendered=pretty,
            related_scenes=data_related_scenes,
        )

    def render__table__as__terminal_renderable(
        self, value: Value, render_config: Mapping[str, Any]
    ):

        input_number_of_rows = render_config.get("number_of_rows", 20)
        input_row_offset = render_config.get("row_offset", 0)

        wrap, data_related_scenes = self.preprocess_table(
            value=value,
            input_number_of_rows=input_number_of_rows,
            input_row_offset=input_row_offset,
        )
        pretty = wrap.as_terminal_renderable(max_row_height=1)

        return RenderValueResult(
            value_id=value.value_id,
            render_config=render_config,
            render_manifest=self.manifest.manifest_hash,
            rendered=pretty,
            related_scenes=data_related_scenes,
        )

Functions

render__table__as__string(value: Value, render_config: Mapping[str, Any])
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/table/__init__.py
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
def render__table__as__string(self, value: Value, render_config: Mapping[str, Any]):

    input_number_of_rows = render_config.get("number_of_rows", 20)
    input_row_offset = render_config.get("row_offset", 0)

    wrap, data_related_scenes = self.preprocess_table(
        value=value,
        input_number_of_rows=input_number_of_rows,
        input_row_offset=input_row_offset,
    )
    pretty = wrap.as_string(max_row_height=1)

    return RenderValueResult(
        value_id=value.value_id,
        render_config=render_config,
        render_manifest=self.manifest.manifest_hash,
        rendered=pretty,
        related_scenes=data_related_scenes,
    )
render__table__as__terminal_renderable(value: Value, render_config: Mapping[str, Any])
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/modules/table/__init__.py
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
def render__table__as__terminal_renderable(
    self, value: Value, render_config: Mapping[str, Any]
):

    input_number_of_rows = render_config.get("number_of_rows", 20)
    input_row_offset = render_config.get("row_offset", 0)

    wrap, data_related_scenes = self.preprocess_table(
        value=value,
        input_number_of_rows=input_number_of_rows,
        input_row_offset=input_row_offset,
    )
    pretty = wrap.as_terminal_renderable(max_row_height=1)

    return RenderValueResult(
        value_id=value.value_id,
        render_config=render_config,
        render_manifest=self.manifest.manifest_hash,
        rendered=pretty,
        related_scenes=data_related_scenes,
    )