Skip to content

array

Classes

ArrayType

Bases: AnyType[KiaraArray, DataTypeConfig]

An array, in most cases used as a column within a table.

Internally, this type uses the KiaraArray wrapper class to manage array data. This wrapper class, in turn, uses an Apache Arrow Array to store the data in memory (and on disk).

Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/data_types/array.py
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
class ArrayType(AnyType[KiaraArray, DataTypeConfig]):
    """An array, in most cases used as a column within a table.

    Internally, this type uses the [KiaraArray][kiara_plugin.tabular.models.array.KiaraArray] wrapper class to manage array data. This wrapper class, in turn, uses an [Apache Arrow](https://arrow.apache.org) [Array](https://arrow.apache.org/docs/python/generated/pyarrow.Array.html#pyarrow.Array) to store the data in memory (and on disk).
    """

    _data_type_name = "array"

    @classmethod
    def python_class(cls) -> Type:
        """Return the Python class that holds values of this data type."""
        return KiaraArray

    def parse_python_obj(self, data: Any) -> KiaraArray:
        """Convert `data` into a `KiaraArray` using `KiaraArray.create_array`."""

        return KiaraArray.create_array(data)

    def _validate(self, value: Any) -> None:
        """Raise an exception if `value` is not a `KiaraArray` instance."""

        if not isinstance(value, KiaraArray):
            raise Exception(
                f"Invalid type '{type(value).__name__}', must be an instance of the 'KiaraArray' class."
            )

    def serialize(self, data: KiaraArray) -> SerializedData:
        """Write the array to a temporary Arrow file and describe it.

        The Arrow array wrapped by `data` is stored (via `store_array`) as
        `array.arrow` inside a freshly created temporary directory. The
        returned `SerializationResult` references that file as its single
        chunk and carries the `load.array` module configuration listed under
        its `deserialize` metadata.
        """

        import pyarrow as pa

        # TODO: make sure temp dir is in the same partition as file store
        temp_f = tempfile.mkdtemp()

        def cleanup():
            shutil.rmtree(temp_f, ignore_errors=True)

        # The serialized result points at a file inside the temp dir, so the
        # directory is only removed when the interpreter exits.
        atexit.register(cleanup)

        column: pa.Array = data.arrow_array
        file_name = os.path.join(temp_f, "array.arrow")

        store_array(array_obj=column, file_name=file_name, column_name="array")

        chunks = {"array.arrow": {"type": "file", "codec": "raw", "file": file_name}}

        serialized_data = {
            "data_type": self.data_type_name,
            "data_type_config": self.type_config.dict(),
            "data": chunks,
            "serialization_profile": "feather",
            "metadata": {
                "environment": {},
                "deserialize": {
                    "python_object": {
                        "module_type": "load.array",
                        "module_config": {
                            "value_type": "array",
                            "target_profile": "python_object",
                            "serialization_profile": "feather",
                        },
                    }
                },
            },
        }

        serialized = SerializationResult(**serialized_data)
        return serialized

    @staticmethod
    def _prepare_pretty_print(value: Value, render_config: Mapping[str, Any]):
        """Build the shared inputs of the `pretty_print_as__*` methods.

        Returns a 2-tuple: an `ArrowTabularWrap` around a one-column table
        (column name `array`) holding the value's Arrow array, and a dict with
        the `rows_head`, `rows_tail`, `max_row_height` and `max_cell_length`
        keyword arguments derived from `render_config` (falling back to
        `DEFAULT_PRETTY_PRINT_CONFIG`).
        """

        import pyarrow as pa

        max_rows = render_config.get(
            "max_no_rows", DEFAULT_PRETTY_PRINT_CONFIG["max_no_rows"]
        )
        max_row_height = render_config.get(
            "max_row_height", DEFAULT_PRETTY_PRINT_CONFIG["max_row_height"]
        )
        max_cell_length = render_config.get(
            "max_cell_length", DEFAULT_PRETTY_PRINT_CONFIG["max_cell_length"]
        )

        # Half of the allowed rows go to the head, half to the tail; a falsy
        # 'max_no_rows' leaves both limits unset (None).
        half_lines: Union[int, None] = None
        if max_rows:
            half_lines = int(max_rows / 2)

        array: pa.Array = value.data.arrow_array
        temp_table = pa.Table.from_arrays(arrays=[array], names=["array"])

        limits = {
            "rows_head": half_lines,
            "rows_tail": half_lines,
            "max_row_height": max_row_height,
            "max_cell_length": max_cell_length,
        }
        return ArrowTabularWrap(temp_table), limits

    def pretty_print_as__terminal_renderable(
        self, value: Value, render_config: Mapping[str, Any]
    ) -> Any:
        """Render the array as a terminal renderable, without a table header."""

        atw, limits = self._prepare_pretty_print(value, render_config)
        return atw.as_terminal_renderable(show_table_header=False, **limits)

    def pretty_print_as__string(
        self, value: Value, render_config: Mapping[str, Any]
    ) -> Any:
        """Render the array as a string."""

        atw, limits = self._prepare_pretty_print(value, render_config)
        return atw.as_string(**limits)

Functions

python_class() -> Type classmethod
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/data_types/array.py
45
46
47
@classmethod
def python_class(cls) -> Type:
    """Return the Python class that holds values of this data type."""
    wrapper_class = KiaraArray
    return wrapper_class
parse_python_obj(data: Any) -> KiaraArray
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/data_types/array.py
49
50
51
def parse_python_obj(self, data: Any) -> KiaraArray:
    """Convert `data` into a `KiaraArray` using `KiaraArray.create_array`."""
    kiara_array = KiaraArray.create_array(data)
    return kiara_array
serialize(data: KiaraArray) -> SerializedData
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/data_types/array.py
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
def serialize(self, data: KiaraArray) -> SerializedData:
    """Write the array to a temporary Arrow file and describe it.

    The Arrow array wrapped by `data` is stored (via `store_array`) as
    `array.arrow` in a new temporary directory; the returned
    `SerializationResult` references that file as its only chunk.
    """

    import pyarrow as pa

    # TODO: make sure temp dir is in the same partition as file store
    work_dir = tempfile.mkdtemp()

    # The result references a file in the temp dir, so the directory is only
    # removed when the interpreter exits.
    atexit.register(shutil.rmtree, work_dir, ignore_errors=True)

    arrow_data: pa.Array = data.arrow_array
    target_path = os.path.join(work_dir, "array.arrow")
    store_array(array_obj=arrow_data, file_name=target_path, column_name="array")

    load_module = {
        "module_type": "load.array",
        "module_config": {
            "value_type": "array",
            "target_profile": "python_object",
            "serialization_profile": "feather",
        },
    }

    return SerializationResult(
        data_type=self.data_type_name,
        data_type_config=self.type_config.dict(),
        data={"array.arrow": {"type": "file", "codec": "raw", "file": target_path}},
        serialization_profile="feather",
        metadata={
            "environment": {},
            "deserialize": {"python_object": load_module},
        },
    )
pretty_print_as__terminal_renderable(value: Value, render_config: Mapping[str, Any]) -> Any
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/data_types/array.py
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
def pretty_print_as__terminal_renderable(
    self, value: Value, render_config: Mapping[str, Any]
) -> Any:
    """Render the array as a terminal renderable, without a table header.

    Row/cell limits are read from `render_config`, falling back to
    `DEFAULT_PRETTY_PRINT_CONFIG`.
    """

    import pyarrow as pa

    defaults = DEFAULT_PRETTY_PRINT_CONFIG
    row_limit = render_config.get("max_no_rows", defaults["max_no_rows"])
    height_limit = render_config.get("max_row_height", defaults["max_row_height"])
    cell_limit = render_config.get("max_cell_length", defaults["max_cell_length"])

    # Split the allowed rows evenly between head and tail; no limit if falsy.
    rows_per_end: Union[int, None] = int(row_limit / 2) if row_limit else None

    arrow_array: pa.Array = value.data.arrow_array
    single_column = pa.Table.from_arrays(arrays=[arrow_array], names=["array"])

    return ArrowTabularWrap(single_column).as_terminal_renderable(
        rows_head=rows_per_end,
        rows_tail=rows_per_end,
        max_row_height=height_limit,
        max_cell_length=cell_limit,
        show_table_header=False,
    )
pretty_print_as__string(value: Value, render_config: Mapping[str, Any]) -> Any
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/data_types/array.py
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
def pretty_print_as__string(
    self, value: Value, render_config: Mapping[str, Any]
) -> Any:
    """Render the array as a string.

    Row/cell limits are read from `render_config`, falling back to
    `DEFAULT_PRETTY_PRINT_CONFIG`.
    """

    import pyarrow as pa

    defaults = DEFAULT_PRETTY_PRINT_CONFIG
    row_limit = render_config.get("max_no_rows", defaults["max_no_rows"])
    height_limit = render_config.get("max_row_height", defaults["max_row_height"])
    cell_limit = render_config.get("max_cell_length", defaults["max_cell_length"])

    # Split the allowed rows evenly between head and tail; no limit if falsy.
    rows_per_end: Union[int, None] = int(row_limit / 2) if row_limit else None

    arrow_array: pa.Array = value.data.arrow_array
    single_column = pa.Table.from_arrays(arrays=[arrow_array], names=["array"])

    return ArrowTabularWrap(single_column).as_string(
        rows_head=rows_per_end,
        rows_tail=rows_per_end,
        max_row_height=height_limit,
        max_cell_length=cell_limit,
    )

Functions

store_array(array_obj: pa.Array, file_name: str, column_name: str = 'array')

Utility method to store an array to a file.

Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/tabular/data_types/array.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
def store_array(array_obj: "pa.Array", file_name: str, column_name: "str" = "array"):
    """Utility method to store an array to a file.

    The array is written in the Arrow IPC file format as a single-column
    table whose column is named `column_name`.

    Arguments:
        array_obj: the data to write; a `pyarrow.ChunkedArray` is written as
            one record batch per chunk, a plain `pyarrow.Array` as a single
            record batch
        file_name: path of the file to create (opened in "wb" mode)
        column_name: name of the single column in the written schema

    Raises:
        NotImplementedError: if `array_obj` is neither a `pyarrow.ChunkedArray`
            nor a `pyarrow.Array`
    """

    import pyarrow as pa

    schema = pa.schema([pa.field(column_name, array_obj.type)])

    # Resolve the chunks up front, so an unsupported input type is rejected
    # before the target file gets created.
    if isinstance(array_obj, pa.ChunkedArray):
        chunks = array_obj.chunks
    elif isinstance(array_obj, pa.Array):
        chunks = [array_obj]
    else:
        raise NotImplementedError(
            f"Can't store array, unsupported type: {type(array_obj).__name__}"
        )

    with pa.OSFile(file_name, "wb") as sink:
        with pa.ipc.new_file(sink, schema=schema) as writer:
            for chunk in chunks:
                batch = pa.record_batch([chunk], schema=schema)
                writer.write(batch)