Dataset reader

Classes that simplify imports from fedbiomed.common.dataset_reader

Classes

CsvReader

CsvReader(path, has_header='auto', delimiter=None)

Parameters:

Name Type Description Default
path Path

The path of the csv file that contains the dataset.

required
has_header str | bool

Indicates whether the file has a header. Defaults to 'auto', in which case the reader tries to detect by itself whether the file has a header.

'auto'
delimiter Optional[str]

The delimiter used in the csv file. Defaults to None, in which case the reader tries to detect the delimiter by itself.

None
Source code in fedbiomed/common/dataset_reader/_csv_reader.py
def __init__(
    self,
    path: Path,
    has_header: str | bool = "auto",
    delimiter: Optional[str] = None,
) -> None:
    """Constructs the csv reader.

    Args:
        path: The path of the csv file that contains the dataset.
        has_header: Boolean to indicate whether the file has a header or not.
            By default it is set as 'auto', which is the case that the reader tries to
            detect itself whether the file has a header or not.
        delimiter: The delimiter used in the csv file.
            By default it is set as None, which is the case that the reader tries to
            detect itself whether the file has a delimiter or not.
    """

    self._path = path

    self._delimiter = delimiter
    self.header: bool | None = None if has_header == "auto" else has_header

    # Pre-parse the CSV file to determine its delimiter and header
    # Note: this will read the first line of the file
    self._pre_parse()

    # Initialize the data and the column names
    self.data = self._read()
    self.columns = list(self.data.columns)

    # Initialize shape and length
    # Defer costly operations
    self._shape = self.data.shape
    self._len = self._shape[0]

Attributes

columns instance-attribute
columns = list(columns)
data instance-attribute
data = _read()
header instance-attribute
header = None if has_header == 'auto' else has_header

Functions

get
get(indexes, columns=None)

Gets the specified rows and columns in the dataset.

Parameters:

Name Type Description Default
indexes int | Iterable

Row indexes to retrieve.

required
columns Optional[Iterable | int | str]

(Optional) list of columns to retrieve.

None

Returns: Polars DataFrame: The specified dataframe.

Source code in fedbiomed/common/dataset_reader/_csv_reader.py
def get(
    self,
    indexes: int | Iterable,
    columns: Optional[Iterable | int | str] = None,
) -> pl.DataFrame:
    """Retrieves the requested rows and columns of the dataset.

    Args:
        indexes: Row indexes to retrieve.
        columns: (Optional) list of columns to retrieve.
    Returns:
        Polars DataFrame: The specified dataframe.
    """
    # The selection itself is delegated to the internal entry accessor
    selection = self._get_entry(indexes=indexes, columns=columns)
    return selection
len
len()

Get number of samples

Source code in fedbiomed/common/dataset_reader/_csv_reader.py
def len(self) -> int:
    """Returns the number of samples (rows) cached for the dataset."""
    n_samples = self._len
    return n_samples
normalize_columns
normalize_columns(columns)

Validates columns and returns them in type list

Parameters:

Name Type Description Default
columns Iterable | int | str

Columns to extract, can be an iterable of column names, a single column name, or an integer index.

required

Returns: A list of column names.

Raises:

Type Description
FedbiomedUserInputError

If the input does not match the types expected

Source code in fedbiomed/common/dataset_reader/_csv_reader.py
def normalize_columns(self, columns: Iterable | int | str) -> list[str]:
    """Validates `columns` and returns them in type `list`

    Args:
        columns: Columns to extract, can be an iterable of column names, a single
            column name, or an integer index.
    Returns:
        A list of column names.

    Raises:
        FedbiomedUserInputError: If the input does not match the types expected
    """

    if isinstance(columns, str) or isinstance(columns, int):
        columns = [columns]

    # if columns is a list of int, convert it to a list of column names
    # (auto-generated by polars as column_0, column_1, etc. if there is no header)
    if all(isinstance(item, int) for item in columns):
        n_cols = len(self.columns)
        _inter = list(filter(lambda x: 0 > x or x > n_cols, columns))
        if any(_inter):
            raise FedbiomedUserInputError(
                f"Column index(es) {_inter} is out of range (0 to {n_cols - 1})"
            )
        columns = [self.columns[i] for i in columns]
    _faulty_col = list(filter(lambda x: x not in self.columns, columns))
    if any(_faulty_col):
        msg = f"Cannot read columns {_faulty_col}: file does not contain those columns specified"
        raise FedbiomedUserInputError(msg)

    return columns
shape
shape()

Returns the shape of the csv dataset.

Computed before applying transforms or conversion to other format.

Returns:

Type Description

Dictionary with the shape and other necessary info for the dataset

Source code in fedbiomed/common/dataset_reader/_csv_reader.py
def shape(self):
    """Reports the shape of the csv dataset.

    The value is the one cached on the reader, i.e. computed before applying
    transforms or conversion to other format.

    Returns:
        Dictionary with the shape and other necessary info for the dataset
    """
    shape_info = {"csv": self._shape}
    return shape_info
to_numpy
to_numpy()

Returns the data as a Numpy ndarray.

Source code in fedbiomed/common/dataset_reader/_csv_reader.py
def to_numpy(self):
    """Converts the loaded data to a Numpy ndarray and returns it."""
    frame = self.data
    return frame.to_numpy()
to_pandas
to_pandas()

Returns the data as a Pandas Dataframe.

Source code in fedbiomed/common/dataset_reader/_csv_reader.py
def to_pandas(self) -> pd.DataFrame:
    """Converts the loaded data to a Pandas Dataframe and returns it."""
    frame = self.data
    return frame.to_pandas()
unsafe_to_torch
unsafe_to_torch()

This is an unsafe method that returns the data as a Torch Tensor.

Warning: This method requires that columns have homogeneous data types. Having mixed types will raise an error.

Source code in fedbiomed/common/dataset_reader/_csv_reader.py
def unsafe_to_torch(self):
    """This is an unsafe method that returns the data as a Torch Tensor.

    Warning: This method requires that columns have homogeneous data types. Having
    mixed types will raise an error.

    Returns:
        The result of calling `to_torch()` on the loaded data.
    """
    # The conversion result must be returned; without the `return` the caller
    # always received None.
    return self.data.to_torch()
validate
validate()

Validate the path of the CSV file.

Raises:

Type Description
FedbiomedError

If the path is invalid

Source code in fedbiomed/common/dataset_reader/_csv_reader.py
def validate(self) -> None:
    """Checks that the reader's path points to an existing file.

    Raises:
        FedbiomedError: If the path is invalid
    """
    # Nothing to report when the path is a regular file
    if os.path.isfile(self._path):
        return

    raise FedbiomedError(f"error: cannot find csv file {self._path}")

NiftiReader

Attributes

data_type class-attribute instance-attribute
data_type = Nifti1Image

Functions

read classmethod
read(path)

Reads the NIfTI file and returns it as a Nifti1Image object.

Parameters:

Name Type Description Default
path Union[str, Path]

Path to the NIfTI file (.nii or .nii.gz)

required

Returns:

Type Description
Nifti1Image

The image loaded from the NIfTI file.

Raises: TypeError: If the path is neither a string nor a Path. FileNotFoundError: If the file does not exist. ValueError: If the path is not a file or does not have an accepted NIfTI extension.

Source code in fedbiomed/common/dataset_reader/_nifti_reader.py
@classmethod
def read(cls, path: Union[str, Path]) -> nib.Nifti1Image:
    """Reads the NIfTI file and returns the loaded image object.

    The image is returned as produced by `nib.load`; no conversion to a tensor
    or array and no transform is applied here.

    Args:
        path (Union[str, Path]): Path to the NIfTI file (.nii or .nii.gz)

    Returns:
        nib.Nifti1Image: The image loaded from the file.
    Raises:
        TypeError: If `path` is neither a string nor a `Path`.
        FileNotFoundError: If the file does not exist.
        ValueError: If the path is rejected by `cls.validate` (not a file, or
            not an accepted NIfTI extension).
    """
    # IMPORTANT: Read function does not return header. It may be useful in the future.
    if isinstance(path, str):
        path = Path(path)
    elif not isinstance(path, Path):
        raise TypeError(f"Expected path to be a string or Path, got {type(path)}")

    # `path` is guaranteed to be a Path from here on, so no further conversion
    # is needed before validating it
    cls.validate(path)

    return nib.load(str(path))
validate staticmethod
validate(path)

Validate the file path and extension.

Parameters:

Name Type Description Default
path Path

Path to the NIfTI file.

required
Source code in fedbiomed/common/dataset_reader/_nifti_reader.py
@staticmethod
def validate(path: Path) -> None:
    """Validate the file path and extension.

    Args:
        path (Path): Path to the NIfTI file.
    """
    if not path.exists():
        raise FileNotFoundError(f"NIfTI file does not exist: {path}")
    if not path.is_file():
        raise ValueError(f"Provided path is not a file: {path}")
    if path.suffix not in {".nii", ".gz"} and not path.name.endswith(".nii.gz"):
        raise ValueError(f"File must be .nii or .nii.gz: {path.name}")