Skip to content

table

Attributes

Classes

TableType

Bases: AnyType[KiaraTable, DataTypeConfig]

Tabular data (table, spreadsheet, data_frame, what have you).

The table data is organized in sets of columns (arrays of data of the same type), with each column having a string identifier.

kiara uses an instance of the KiaraTable class to manage the table data, which let's developers access it in different formats (Apache Arrow Table, Pandas dataframe, Python dict of lists, more to follow...).

Please consult the API doc of the KiaraTable class for more information about how to access and query the data:

Internally, the data is stored in Apache Feather format -- both in memory and on disk when saved, which enables some advanced usage to preserve memory and compute overhead.

Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/data_types/table.py
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
class TableType(AnyType[KiaraTable, DataTypeConfig]):
    """Tabular data (table, spreadsheet, data_frame, what have you).

    The table data is organized in sets of columns (arrays of data of the same type), with each column having a string identifier.

    *kiara* uses an instance of the [`KiaraTable`][kiara_plugin.tabular.models.table.KiaraTable]
    class to manage the table data, which let's developers access it in different formats ([Apache Arrow Table](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html), [Pandas dataframe](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html), Python dict of lists, more to follow...).

    Please consult the API doc of the `KiaraTable` class for more information about how to access and query the data:

    - [`KiaraTable` API doc](https://dharpa.org/kiara_plugin.tabular/latest/reference/kiara_plugin/tabular/models/__init__/#kiara_plugin.tabular.models.table.KiaraTable)

    Internally, the data is stored in [Apache Feather format](https://arrow.apache.org/docs/python/feather.html) -- both
    in memory and on disk when saved, which enables some advanced usage to preserve memory and compute overhead.
    """

    _data_type_name = "table"

    @classmethod
    def python_class(cls) -> Type:
        return KiaraTable

    def parse_python_obj(self, data: Any) -> KiaraTable:

        return KiaraTable.create_table(data)

    def _validate(cls, value: Any) -> None:

        pass

        if not isinstance(value, KiaraTable):
            raise Exception(
                f"invalid type '{type(value).__name__}', must be 'KiaraTable'."
            )

    def serialize(self, data: KiaraTable) -> SerializedData:

        import pyarrow as pa

        chunk_map = {}

        # TODO: make sure temp dir is in the same partition as file store
        temp_f = tempfile.mkdtemp()

        def cleanup():
            shutil.rmtree(temp_f, ignore_errors=True)

        atexit.register(cleanup)

        table = data.arrow_table

        schema_bytes = table.schema.serialize().to_pybytes()
        chunk_map[TABLE_SCHEMA_CHUNKS_NAME] = {
            "type": "chunk",
            "chunk": schema_bytes,
            "codec": "raw",
        }
        for column_name in table.column_names:

            assert column_name != TABLE_SCHEMA_CHUNKS_NAME

            column: pa.Array = table.column(column_name)

            if not column_name:
                file_name = os.path.join(temp_f, EMPTY_COLUMN_NAME_MARKER)
            else:
                file_name = os.path.join(temp_f, column_name)
            store_array(array_obj=column, file_name=file_name, column_name=column_name)
            chunk_map[column_name] = {"type": "file", "file": file_name, "codec": "raw"}

        serialized_data = {
            "data_type": self.data_type_name,
            "data_type_config": self.type_config.dict(),
            "data": chunk_map,
            "serialization_profile": "feather",
            "metadata": {
                "environment": {},
                "deserialize": {
                    "python_object": {
                        "module_type": "load.table",
                        "module_config": {
                            "value_type": "table",
                            "target_profile": "python_object",
                            "serialization_profile": "feather",
                        },
                    }
                },
            },
        }

        serialized = SerializationResult(**serialized_data)
        return serialized

    def pretty_print_as__terminal_renderable(
        self, value: "Value", render_config: Mapping[str, Any]
    ) -> Any:

        max_rows = render_config.get(
            "max_no_rows", DEFAULT_PRETTY_PRINT_CONFIG["max_no_rows"]
        )
        max_row_height = render_config.get(
            "max_row_height", DEFAULT_PRETTY_PRINT_CONFIG["max_row_height"]
        )
        max_cell_length = render_config.get(
            "max_cell_length", DEFAULT_PRETTY_PRINT_CONFIG["max_cell_length"]
        )

        half_lines: Union[int, None] = None
        if max_rows:
            half_lines = int(max_rows / 2)

        atw = ArrowTabularWrap(value.data.arrow_table)
        result = atw.as_terminal_renderable(
            rows_head=half_lines,
            rows_tail=half_lines,
            max_row_height=max_row_height,
            max_cell_length=max_cell_length,
        )
        return result

    def pretty_print_as__string(
        self, value: "Value", render_config: Mapping[str, Any]
    ) -> Any:

        max_rows = render_config.get(
            "max_no_rows", DEFAULT_PRETTY_PRINT_CONFIG["max_no_rows"]
        )
        max_row_height = render_config.get(
            "max_row_height", DEFAULT_PRETTY_PRINT_CONFIG["max_row_height"]
        )
        max_cell_length = render_config.get(
            "max_cell_length", DEFAULT_PRETTY_PRINT_CONFIG["max_cell_length"]
        )

        half_lines: Union[int, None] = None
        if max_rows:
            half_lines = int(max_rows / 2)

        atw = ArrowTabularWrap(value.data.arrow_table)
        result = atw.as_string(
            rows_head=half_lines,
            rows_tail=half_lines,
            max_row_height=max_row_height,
            max_cell_length=max_cell_length,
        )
        return result

Functions

python_class() -> Type classmethod
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/data_types/table.py
37
38
39
@classmethod
def python_class(cls) -> Type:
    return KiaraTable
parse_python_obj(data: Any) -> KiaraTable
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/data_types/table.py
41
42
43
def parse_python_obj(self, data: Any) -> KiaraTable:

    return KiaraTable.create_table(data)
serialize(data: KiaraTable) -> SerializedData
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/data_types/table.py
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
def serialize(self, data: KiaraTable) -> SerializedData:

    import pyarrow as pa

    chunk_map = {}

    # TODO: make sure temp dir is in the same partition as file store
    temp_f = tempfile.mkdtemp()

    def cleanup():
        shutil.rmtree(temp_f, ignore_errors=True)

    atexit.register(cleanup)

    table = data.arrow_table

    schema_bytes = table.schema.serialize().to_pybytes()
    chunk_map[TABLE_SCHEMA_CHUNKS_NAME] = {
        "type": "chunk",
        "chunk": schema_bytes,
        "codec": "raw",
    }
    for column_name in table.column_names:

        assert column_name != TABLE_SCHEMA_CHUNKS_NAME

        column: pa.Array = table.column(column_name)

        if not column_name:
            file_name = os.path.join(temp_f, EMPTY_COLUMN_NAME_MARKER)
        else:
            file_name = os.path.join(temp_f, column_name)
        store_array(array_obj=column, file_name=file_name, column_name=column_name)
        chunk_map[column_name] = {"type": "file", "file": file_name, "codec": "raw"}

    serialized_data = {
        "data_type": self.data_type_name,
        "data_type_config": self.type_config.dict(),
        "data": chunk_map,
        "serialization_profile": "feather",
        "metadata": {
            "environment": {},
            "deserialize": {
                "python_object": {
                    "module_type": "load.table",
                    "module_config": {
                        "value_type": "table",
                        "target_profile": "python_object",
                        "serialization_profile": "feather",
                    },
                }
            },
        },
    }

    serialized = SerializationResult(**serialized_data)
    return serialized
pretty_print_as__terminal_renderable(value: Value, render_config: Mapping[str, Any]) -> Any
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/data_types/table.py
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
def pretty_print_as__terminal_renderable(
    self, value: "Value", render_config: Mapping[str, Any]
) -> Any:

    max_rows = render_config.get(
        "max_no_rows", DEFAULT_PRETTY_PRINT_CONFIG["max_no_rows"]
    )
    max_row_height = render_config.get(
        "max_row_height", DEFAULT_PRETTY_PRINT_CONFIG["max_row_height"]
    )
    max_cell_length = render_config.get(
        "max_cell_length", DEFAULT_PRETTY_PRINT_CONFIG["max_cell_length"]
    )

    half_lines: Union[int, None] = None
    if max_rows:
        half_lines = int(max_rows / 2)

    atw = ArrowTabularWrap(value.data.arrow_table)
    result = atw.as_terminal_renderable(
        rows_head=half_lines,
        rows_tail=half_lines,
        max_row_height=max_row_height,
        max_cell_length=max_cell_length,
    )
    return result
pretty_print_as__string(value: Value, render_config: Mapping[str, Any]) -> Any
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/data_types/table.py
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
def pretty_print_as__string(
    self, value: "Value", render_config: Mapping[str, Any]
) -> Any:

    max_rows = render_config.get(
        "max_no_rows", DEFAULT_PRETTY_PRINT_CONFIG["max_no_rows"]
    )
    max_row_height = render_config.get(
        "max_row_height", DEFAULT_PRETTY_PRINT_CONFIG["max_row_height"]
    )
    max_cell_length = render_config.get(
        "max_cell_length", DEFAULT_PRETTY_PRINT_CONFIG["max_cell_length"]
    )

    half_lines: Union[int, None] = None
    if max_rows:
        half_lines = int(max_rows / 2)

    atw = ArrowTabularWrap(value.data.arrow_table)
    result = atw.as_string(
        rows_head=half_lines,
        rows_tail=half_lines,
        max_row_height=max_row_height,
        max_cell_length=max_cell_length,
    )
    return result

Functions