Skip to content

array

Classes

KiaraArray (KiaraModel) pydantic-model

A class to manage array-like data.

Internally, this uses an Apache Arrow Array to handle the data in memory and on disk.

Source code in tabular/models/array.py
class KiaraArray(KiaraModel):
    """A class to manage array-like data.

    Internally, this uses an [Apache Arrow Array](https://arrow.apache.org/docs/python/generated/pyarrow.Array.html#pyarrow.Array) to handle the data in memory and on disk.
    """

    # @classmethod
    # def create_in_temp_dir(cls, ):
    #
    #     temp_f = tempfile.mkdtemp()
    #     file_path = os.path.join(temp_f, "array.feather")
    #
    #     def cleanup():
    #         shutil.rmtree(file_path, ignore_errors=True)
    #
    #     atexit.register(cleanup)
    #
    #     array_obj = cls(feather_path=file_path)
    #     return array_obj

    @classmethod
    def create_array(cls, data: Any) -> "KiaraArray":

        if isinstance(data, KiaraArray):
            return data

        array_obj = None
        if isinstance(data, (pa.Array, pa.ChunkedArray)):
            array_obj = data
        elif isinstance(data, pa.Table):
            if len(data.columns) != 1:
                raise Exception(
                    f"Invalid type, only Arrow Arrays or single-column Tables allowed. This value is a table with {len(data.columns)} columns."
                )
            array_obj = data.column(0)
        else:
            try:
                array_obj = pa.array(data)
            except Exception:
                pass

        if array_obj is None:
            raise Exception(
                f"Can't create table, invalid source data type: {type(data)}."
            )

        obj = KiaraArray()
        if not isinstance(array_obj, pa.lib.ChunkedArray):
            array_obj = pa.chunked_array(array_obj)
        obj._array_obj = array_obj
        return obj

    data_path: Union[str, None] = Field(
        description="The path to the (feather) file backing this array.", default=None
    )

    _array_obj: pa.Array = PrivateAttr(default=None)

    def _retrieve_data_to_hash(self) -> Any:
        raise NotImplementedError()

    def __len__(self):
        return len(self.arrow_array)

    @property
    def arrow_array(self) -> pa.Array:

        if self._array_obj is not None:
            return self._array_obj

        if not self.data_path:
            raise Exception("Can't retrieve array data, object not initialized (yet).")

        with pa.memory_map(self.data_path, "r") as source:
            table: pa.Table = pa.ipc.open_file(source).read_all()

        if len(table.columns) != 1:
            raise Exception(
                f"Invalid serialized array data, only a single-column Table is allowed. This value is a table with {len(table.columns)} columns."
            )

        self._array_obj = table.column(0)
        return self._array_obj

    def to_pylist(self):
        return self.arrow_array.to_pylist()

    def to_pandas(self):
        return self.arrow_array.to_pandas()

Attributes

arrow_array: Array property readonly
data_path: str pydantic-field

The path to the (feather) file backing this array.

create_array(data) classmethod
Source code in tabular/models/array.py
@classmethod
def create_array(cls, data: Any) -> "KiaraArray":

    if isinstance(data, KiaraArray):
        return data

    array_obj = None
    if isinstance(data, (pa.Array, pa.ChunkedArray)):
        array_obj = data
    elif isinstance(data, pa.Table):
        if len(data.columns) != 1:
            raise Exception(
                f"Invalid type, only Arrow Arrays or single-column Tables allowed. This value is a table with {len(data.columns)} columns."
            )
        array_obj = data.column(0)
    else:
        try:
            array_obj = pa.array(data)
        except Exception:
            pass

    if array_obj is None:
        raise Exception(
            f"Can't create table, invalid source data type: {type(data)}."
        )

    obj = KiaraArray()
    if not isinstance(array_obj, pa.lib.ChunkedArray):
        array_obj = pa.chunked_array(array_obj)
    obj._array_obj = array_obj
    return obj
to_pandas(self)
Source code in tabular/models/array.py
def to_pandas(self):
    return self.arrow_array.to_pandas()
to_pylist(self)
Source code in tabular/models/array.py
def to_pylist(self):
    return self.arrow_array.to_pylist()