# Source code for geoh5py.data.text_data
# Copyright (c) 2024 Mira Geoscience Ltd.
#
# This file is part of geoh5py.
#
# geoh5py is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# geoh5py is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with geoh5py. If not, see <https://www.gnu.org/licenses/>.
from __future__ import annotations
import json
import numpy as np
from numpy import ndarray
from ..shared.utils import as_str_if_uuid, dict_mapper
from .data import Data
from .primitive_type_enum import PrimitiveTypeEnum
[docs]
def text_formating(values: None | np.ndarray | str) -> ndarray | None:
"""
Format text values to utf-8.
:param values: The values to format.
:return: The formatted values.
"""
# todo: values[0] seems dangerous here
if values is None or isinstance(values[0], bytes):
return values
return np.char.encode(values, encoding="utf-8").astype("O")
[docs]
class TextData(Data):
    """
    Data entity whose primitive type is TEXT.
    """

    @property
    def formatted_values(self):
        """
        The values as returned by :func:`text_formating`.
        """
        return text_formating(self.values)

    @property
    def nan_value(self):
        """
        Nan-Data-Value to be used in arrays
        """
        return ""

    @classmethod
    def primitive_type(cls) -> PrimitiveTypeEnum:
        """
        :return: The TEXT primitive type.
        """
        return PrimitiveTypeEnum.TEXT

    def validate_values(
        self, values: np.ndarray | str | None
    ) -> np.ndarray | str | None:
        """
        Validate and normalise text values.

        ``bytes`` input is decoded to ``str``, and ``bytes`` entries of object
        arrays are decoded as utf-8. Arrays shorter than ``n_values`` are
        padded at the end with :attr:`nan_value`.

        :param values: A string, an array of strings, or None.

        :return: The validated values.

        :raises ValueError: If values is not a str, None, or an array of
            unicode or byte strings.
        """
        if isinstance(values, bytes):
            values = values.decode()

        if isinstance(values, np.ndarray) and values.dtype == object:
            values = np.array(
                [v.decode("utf-8") if isinstance(v, bytes) else v for v in values]
            )

        # Scalars need no further checks; len() and dtype only apply to arrays.
        if values is None or isinstance(values, str):
            return values

        is_text_array = isinstance(values, np.ndarray) and values.dtype.kind in (
            "U",
            "S",
        )
        if not is_text_array:
            raise ValueError(
                f"Input 'values' for {self} must be of type {np.ndarray} str or None."
            )

        # Pad only after the dtype is known to be text, so that an invalid
        # array is rejected instead of being padded and returned.
        if self.n_values is not None and len(values) < self.n_values:
            full_array = np.full(self.n_values, self.nan_value, dtype=values.dtype)
            full_array[: len(values)] = values
            return full_array

        return values
[docs]
class MultiTextData(Data):
    """
    Data entity whose primitive type is MULTI_TEXT.
    """

    _values: np.ndarray | str | None

    @property
    def formatted_values(self):
        """
        The values as returned by :func:`text_formating`.
        """
        return text_formating(self.values)

    @property
    def nan_value(self):
        """
        Value used to represent missing data in python.
        """
        return ""

    @classmethod
    def primitive_type(cls) -> PrimitiveTypeEnum:
        """
        :return: The MULTI_TEXT primitive type.
        """
        return PrimitiveTypeEnum.MULTI_TEXT

    def validate_values(
        self, values: np.ndarray | str | None
    ) -> np.ndarray | str | None:
        """
        Validate the values and pad short arrays.

        Arrays are always accepted. When ``n_values`` is known and the array
        is shorter, it is padded at the end with :attr:`nan_value`.

        :param values: A string, an array, or None.

        :return: The validated values.

        :raises ValueError: If values is not an array, a str or None.
        """
        if isinstance(values, np.ndarray):
            # An unknown n_values must not reject the array; it only means
            # there is no target length to pad to.
            if self.n_values is not None and len(values) < self.n_values:
                full_array = np.full(
                    self.n_values, self.nan_value, dtype=values.dtype
                )
                full_array[: len(values)] = values
                return full_array
            return values

        if not isinstance(values, (str, type(None))):
            raise ValueError(
                f"Input 'values' for {self} must be of type {np.ndarray} str or None."
            )

        return values