Dataset reader

Classes that simplify imports from fedbiomed.common.dataset_reader

Classes

CsvReader

CsvReader(path, has_header='auto', delimiter=None)

Parameters:

Name	Type	Description	Default
`path`	`Path`	The path of the csv file that contains the dataset.	required
`has_header`	`str \| bool`	Whether the file has a header row. Pass `True` or `False` to state it explicitly; the default, `'auto'`, lets the reader detect by itself whether the file has a header.	`'auto'`
`delimiter`	`Optional[str]`	The delimiter used in the csv file. The default, `None`, lets the reader detect the delimiter by itself.	`None`

Source code in fedbiomed/common/dataset_reader/_csv_reader.py

def __init__(
    self,
    path: Path,
    has_header: str | bool = "auto",
    delimiter: Optional[str] = None,
) -> None:
    """Builds the csv reader and loads the dataset.

    Args:
        path: The path of the csv file that contains the dataset.
        has_header: Whether the file has a header row. The default, 'auto',
            asks the reader to detect the header by itself.
        delimiter: The delimiter used in the csv file. The default, None,
            asks the reader to detect the delimiter by itself.
    """

    self._path = path
    self._delimiter = delimiter

    # 'auto' is stored as None; any other value is kept exactly as given
    if has_header == "auto":
        self.header: bool | None = None
    else:
        self.header = has_header

    # Pre-parse the CSV file to determine its delimiter and header
    # Note: this will read the first line of the file
    self._pre_parse()

    # Load the data, then record its column names
    self.data = self._read()
    self.columns = list(self.data.columns)

    # Cache the shape and the length (first entry of the shape) right away
    dims = self.data.shape
    self._shape = dims
    self._len = dims[0]

Attributes

columns `instance-attribute`

columns = list(columns)

data `instance-attribute`

data = _read()

header = None if has_header == 'auto' else has_header

Functions

get

get(indexes, columns=None)

Gets the specified rows and columns in the dataset.

Parameters:

Name	Type	Description	Default
`indexes`	`int \| Iterable`	Row indexes to retrieve.	required
`columns`	`Optional[Iterable \| int \| str]`	(Optional) list of columns to retrieve.	`None`

Returns: Polars DataFrame: The specified dataframe.

Source code in fedbiomed/common/dataset_reader/_csv_reader.py

def get(
    self,
    indexes: int | Iterable,
    columns: Optional[Iterable | int | str] = None,
) -> pl.DataFrame:
    """Retrieves a selection of rows, and optionally columns, of the dataset.

    The lookup itself is delegated to `self._get_entry`.

    Args:
        indexes: Row indexes to retrieve.
        columns: (Optional) list of columns to retrieve.
    Returns:
        Polars DataFrame: The specified dataframe.
    """
    selection = self._get_entry(indexes=indexes, columns=columns)
    return selection

len

len()

Get number of samples

Source code in fedbiomed/common/dataset_reader/_csv_reader.py

def len(self) -> int:
    """Returns the number of samples stored in `self._len`."""
    n_samples = self._len
    return n_samples

normalize_columns

normalize_columns(columns)

Validates columns and returns them as a list

Parameters:

Name	Type	Description	Default
`columns`	`Iterable \| int \| str`	Columns to extract, can be an iterable of column names, a single column name, or an integer index.	required

Returns: A list of column names.

Raises:

Type	Description
`FedbiomedUserInputError`	If the input does not match the types expected

Source code in fedbiomed/common/dataset_reader/_csv_reader.py

def normalize_columns(self, columns: Iterable | int | str) -> list[str]:
    """Validates `columns` and returns them in type `list`

    Args:
        columns: Columns to extract, can be an iterable of column names, a single
            column name, or an integer index.
    Returns:
        A list of column names.

    Raises:
        FedbiomedUserInputError: If the input does not match the types expected
    """

    if isinstance(columns, str) or isinstance(columns, int):
        columns = [columns]

    # if columns is a list of int, convert it to a list of column names
    # (auto-generated by polars as column_0, column_1, etc. if there is no header)
    if all(isinstance(item, int) for item in columns):
        n_cols = len(self.columns)
        _inter = list(filter(lambda x: 0 > x or x > n_cols, columns))
        if any(_inter):
            raise FedbiomedUserInputError(
                f"Column index(es) {_inter} is out of range (0 to {n_cols - 1})"
            )
        columns = [self.columns[i] for i in columns]
    _faulty_col = list(filter(lambda x: x not in self.columns, columns))
    if any(_faulty_col):
        msg = f"Cannot read columns {_faulty_col}: file does not contain those columns specified"
        raise FedbiomedUserInputError(msg)

    return columns

shape

shape()

Returns the shape of the csv dataset.

Computed before applying transforms or conversion to other format.

Returns:

Type	Description
	Dictionary with the shape and other necessary info for the dataset

Source code in fedbiomed/common/dataset_reader/_csv_reader.py

def shape(self):
    """Reports the shape of the csv dataset.

    The value is the one stored in `self._shape`, i.e. it is computed before
    applying transforms or conversion to other format.

    Returns:
        Dictionary with the shape stored under the "csv" key
    """
    return dict(csv=self._shape)

to_numpy

to_numpy()

Returns the data as a Numpy ndarray.

Source code in fedbiomed/common/dataset_reader/_csv_reader.py

def to_numpy(self):
    """Converts the loaded data with its `to_numpy` method and returns the result."""
    as_array = self.data.to_numpy()
    return as_array

to_pandas

to_pandas()

Returns the data as a Pandas Dataframe.

Source code in fedbiomed/common/dataset_reader/_csv_reader.py

def to_pandas(self) -> pd.DataFrame:
    """Converts the loaded data with its `to_pandas` method and returns the result."""
    as_frame = self.data.to_pandas()
    return as_frame

unsafe_to_torch

unsafe_to_torch()

This is an unsafe method that returns the data as a Torch Tensor.

Warning: This method requires that columns have homogeneous data types. Having mixed types will raise an error.

Source code in fedbiomed/common/dataset_reader/_csv_reader.py

def unsafe_to_torch(self):
    """This is an unsafe method that returns the data as a Torch Tensor.

    Warning: This method requires that columns have homogeneous data types. Having
    mixed types will raise an error.

    Returns:
        The result of calling `to_torch` on the loaded data.
    """
    # The conversion result must be handed back to the caller; without the
    # `return` the method always yields None.
    return self.data.to_torch()

validate

validate()

Validate the path of the CSV file.

Raises:

Type	Description
`FedbiomedError`	If the path is invalid

Source code in fedbiomed/common/dataset_reader/_csv_reader.py

def validate(self) -> None:
    """Checks that the path of the CSV file points to an existing file.

    Raises:
        FedbiomedError: If `self._path` is not an existing file
    """
    if os.path.isfile(self._path):
        return
    raise FedbiomedError(f"error: cannot find csv file {self._path}")

NiftiReader

Attributes

data_type `class-attribute` `instance-attribute`

data_type = Nifti1Image

Functions

read `classmethod`

read(path)

Reads the NIfTI file and returns it as a `Nifti1Image` object.

Parameters:

Name	Type	Description	Default
`path`	`Union[str, Path]`	Path to the NIfTI file (.nii or .nii.gz)	required

Returns:

Type	Description
`Nifti1Image`	The image loaded from the file.

Raises: TypeError: If the path is neither a string nor a Path. FileNotFoundError: If the file does not exist. ValueError: If the path is not a file or is not a valid NIfTI file name (.nii or .nii.gz).

Source code in fedbiomed/common/dataset_reader/_nifti_reader.py

@classmethod
def read(cls, path: Union[str, Path]) -> nib.Nifti1Image:
    """Loads a NIfTI file and returns the image object produced by `nib.load`.

    Args:
        path (Union[str, Path]): Path to the NIfTI file (.nii or .nii.gz)

    Returns:
        nib.Nifti1Image: The loaded image.
    Raises:
        TypeError: If `path` is neither a string nor a `Path`.
        FileNotFoundError: If the file does not exist.
        ValueError: If the file is not a valid NIfTI file.
    """
    # IMPORTANT: this function does not return the header on its own.
    # It may be useful in the future.
    if not isinstance(path, (str, Path)):
        raise TypeError(f"Expected path to be a string or Path, got {type(path)}")

    # Strings and Path objects are both normalized to a Path before validation
    nifti_path = Path(path)
    cls.validate(nifti_path)

    return nib.load(str(nifti_path))

validate `staticmethod`

validate(path)

Validate the file path and extension.

Parameters:

Name	Type	Description	Default
`path`	`Path`	Path to the NIfTI file.	required

Source code in fedbiomed/common/dataset_reader/_nifti_reader.py

@staticmethod
def validate(path: Path) -> None:
    """Validate the file path and extension.

    Args:
        path (Path): Path to the NIfTI file.

    Raises:
        FileNotFoundError: If nothing exists at `path`.
        ValueError: If `path` is not a file, or if its name does not end
            with `.nii` or `.nii.gz`.
    """
    if not path.exists():
        raise FileNotFoundError(f"NIfTI file does not exist: {path}")
    if not path.is_file():
        raise ValueError(f"Provided path is not a file: {path}")
    # Match on the whole name: `path.suffix` only holds the last extension,
    # so testing it against ".gz" would accept any gzip file ("x.tar.gz").
    if not path.name.endswith((".nii", ".nii.gz")):
        raise ValueError(f"File must be .nii or .nii.gz: {path.name}")

Classes

CsvReader

Attributes

columns instance-attribute

data instance-attribute

header instance-attribute

Functions

get

len

normalize_columns

shape

to_numpy

to_pandas

unsafe_to_torch

validate

NiftiReader

Attributes

data_type class-attribute instance-attribute

Functions

read classmethod

validate staticmethod

columns `instance-attribute`

data `instance-attribute`

header `instance-attribute`

data_type `class-attribute` `instance-attribute`

read `classmethod`

validate `staticmethod`