Dataset controller

Classes that simplify imports from fedbiomed.common._dataset_controller

Classes

Controller

Controller()

Bases: ABC, DataLoadingPlanMixin

Source code in fedbiomed/common/dataloadingplan/_data_loading_plan.py
def __init__(self) -> None:
    """Initialize the object without any data loading plan"""
    # `None` marks that no data loading plan is attached yet
    self._dlp = None

Attributes

root property writable
root

Functions

get_sample abstractmethod
get_sample(index)

Retrieve a data sample without applying transforms

Source code in fedbiomed/common/dataset_controller/_controller.py
@abstractmethod
def get_sample(self, index: int) -> Dict[str, Any]:
    """Retrieve a data sample without applying transforms

    Abstract: this body is intentionally empty.

    Args:
        index: Index of the sample to retrieve

    Returns:
        The sample, annotated as a `dict` with `str` keys
    """
get_types
get_types()

Get type directly from values in dict returned by get_sample

Source code in fedbiomed/common/dataset_controller/_controller.py
def get_types(self):
    """Get `type` names of the values in the `dict` returned by `get_sample`

    Only the sample at index 0 is inspected.

    Returns:
        Mapping from each key of the sample to the class name of its value
    """
    first_sample = self.get_sample(0)
    type_names = {}
    for field, value in first_sample.items():
        type_names[field] = type(value).__name__
    return type_names
shape
shape()

Get shape directly from values in dict returned by get_sample

Source code in fedbiomed/common/dataset_controller/_controller.py
def shape(self) -> Dict[str, Any]:
    """Get `shape` of each value in the `dict` returned by `get_sample`

    Only the sample at index 0 is inspected. Child classes can override this
    method for specific cases.

    Returns:
        Mapping from each key of the sample to:
        - `value.shape` if the value has a `shape` attribute
        - `1` for `int` and `float` values
        - the number of entries for `dict` values
        - `{"size": ..., "mode": ...}` for `Image` values
        - `None` for `None` values

    Raises:
        FedbiomedError: if the sample is not a `dict`, or if the type of one
            of its values is not supported
    """
    first = self.get_sample(0)

    if not isinstance(first, dict):
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Expected `sample` to be a `dict`, got "
            f"{type(first).__name__}"
        )

    shapes = {}
    for name, value in first.items():
        # Checks are ordered: a `shape` attribute wins over every other rule
        if hasattr(value, "shape"):
            shapes[name] = value.shape
            continue
        if isinstance(value, (int, float)):
            shapes[name] = 1
            continue
        if isinstance(value, dict):
            shapes[name] = len(value)
            continue
        if isinstance(value, Image):
            shapes[name] = {"size": value.size, "mode": value.mode}
            continue
        if value is None:
            shapes[name] = None
            continue
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Not possible to get shape for value "
            f"of key: {name} that is type {type(value).__name__}"
        )
    return shapes
validate
validate()

Validates coherence of controller

Raises:

Type Description
FedbiomedError

if coherence issue is found

Source code in fedbiomed/common/dataset_controller/_controller.py
def validate(self) -> None:
    """Validates coherence of controller

    This implementation performs no check and always returns `None`.

    Raises:
        FedbiomedError: if coherence issue is found (never raised by this
            implementation)
    """

CustomController

CustomController(root, **kwargs)

Bases: Controller

Custom dataset controller for MNIST dataset

Source code in fedbiomed/common/dataset_controller/_custom_controller.py
def __init__(self, root, **kwargs):
    """Store `root` and record the arguments used to build the controller

    Args:
        root: Value assigned to `self.root`
        **kwargs: Extra keyword arguments, recorded after `root` in
            `self._controller_kwargs`
    """
    self.root = root

    # `root` is recorded as a string, the other arguments are kept unchanged
    recorded = {"root": str(self.root)}
    recorded.update(kwargs)
    self._controller_kwargs = recorded

Attributes

root instance-attribute
root = root

Functions

get_sample
get_sample(index)

Shouldn't be called as the custom controller does not need to get a sample.

It is handled by get_item in CustomDataset.

Source code in fedbiomed/common/dataset_controller/_custom_controller.py
def get_sample(self, index):
    """Placeholder, shouldn't be called: the custom controller does not get samples.

    Retrieving a sample is handled by `get_item` in `CustomDataset`.

    Args:
        index: Ignored

    Returns:
        Always `None`
    """
get_types
get_types()

Controller does not know how to get samples.

Source code in fedbiomed/common/dataset_controller/_custom_controller.py
def get_types(self):
    """Controller does not know how to get samples, so no types are reported.

    Returns:
        Always `None`
    """
    return None
shape
shape()

Len and shape should never be used, as the dataset implements it specifically.

It is impossible for the custom controller to know the shape of the dataset.

Source code in fedbiomed/common/dataset_controller/_custom_controller.py
def shape(self):
    """Placeholder: len and shape should never be used on this controller.

    The dataset implements them specifically, as it is impossible for the
    custom controller to know the shape of the dataset.

    Returns:
        Always `None`
    """

ImageFolderController

ImageFolderController(root)

Bases: Controller

Generic ImageFolder where data is arranged like this:

root
├── class_x
│   ├── xxx.ext
│   ├── xxy.ext
│   └── ...
├── class_y
│   ├── 123.ext
│   └── ...
└── ...

Parameters:

Name Type Description Default
root Union[str, Path]

Root directory path

required

Raises:

Type Description
FedbiomedError

if ImageFolder can not be initialized

Source code in fedbiomed/common/dataset_controller/_image_folder_controller.py
def __init__(self, root: Union[str, Path]) -> None:
    """Constructor of the class

    Lists the classes found under `root` and builds the list of samples.

    Args:
        root: Root directory path

    Raises:
        FedbiomedError: if `ImageFolder` can not be initialized
    """
    self.root = root
    try:
        # Only the class-to-index mapping is kept, the list of classes is dropped
        _, self._class_to_idx = folder.find_classes(directory=self.root)
        # `_extensions`, `_is_valid_file` and `_allow_empty` are read from `self`
        # but not assigned in this constructor (presumably class attributes)
        self._samples = folder.make_dataset(
            directory=self.root,
            class_to_idx=self._class_to_idx,
            extensions=self._extensions,
            is_valid_file=self._is_valid_file,
            allow_empty=self._allow_empty,
        )
    except Exception as e:
        # Any failure while scanning the folder is reported as `FedbiomedError`
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: "
            "Failed to instantiate ImageFolderDataset object"
        ) from e

    # Arguments recorded for this controller, with `root` stored as a string
    self._controller_kwargs = {
        "root": str(self.root),
    }

Attributes

root instance-attribute
root = root

Functions

get_sample
get_sample(index)

Retrieve a data sample without applying transforms

Source code in fedbiomed/common/dataset_controller/_image_folder_controller.py
def get_sample(self, index: int) -> Dict[str, Any]:
    """Retrieve a data sample without applying transforms

    Args:
        index: Position of the sample in `self._samples`

    Returns:
        `dict` with the loaded item under `"data"` and its target under `"target"`

    Raises:
        FedbiomedError: if the sample can not be found or loaded
    """
    try:
        location, label = self._samples[index]
        loaded = self._loader(location)
    except Exception as exc:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Failed to retrieve item at index {index}"
        ) from exc
    else:
        return {"data": loaded, "target": label}

MedNistController

MedNistController(root)

Bases: Controller

Generic data controller where the data is arranged in this way:

root
└── MedNIST
    ├── AbdomenCT
    │   ├── 000000.jpeg
    │   └── ...
    ├── BreastMRI/
    ├── ChestCT/
    ├── CXR/
    ├── Hand/
    └── HeadCT/

Parameters:

Name Type Description Default
root Union[str, Path]

Root directory path

required

Raises:

Type Description
FedbiomedError

if ImageFolder can not be initialized

Source code in fedbiomed/common/dataset_controller/_mednist_controller.py
def __init__(self, root: Union[str, Path]) -> None:
    """Constructor of the class

    Downloads the data when the `MedNIST` folder is missing under `root`,
    then loads that folder with `ImageFolder`.

    Args:
        root: Root directory path

    Raises:
        FedbiomedError: if `ImageFolder` can not be initialized
    """
    self.root = root
    # NOTE(review): `self.root / "MedNIST"` needs `self.root` to support `/`;
    # presumably the assignment above converts `root` to a `Path` - confirm
    if not (self.root / "MedNIST").exists():
        download_mednist(self.root)

    try:
        self._dataset = ImageFolder(self.root / "MedNIST")
    except Exception as e:
        # Original error text is embedded in the message and kept as the cause
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: "
            f"The following error raised while loading the data folder: {e}"
        ) from e

    # Arguments recorded for this controller, with `root` stored as a string
    self._controller_kwargs = {
        "root": str(self.root),
    }

Attributes

root instance-attribute
root = root

Functions

get_sample
get_sample(index)

Retrieve a data sample without applying transforms

Source code in fedbiomed/common/dataset_controller/_mednist_controller.py
def get_sample(self, index: int) -> Dict[str, Any]:
    """Retrieve a data sample without applying transforms

    Args:
        index: Position of the sample in `self._dataset`

    Returns:
        `dict` with the item under `"data"` and its target under `"target"`

    Raises:
        FedbiomedError: if the item can not be retrieved
    """
    try:
        image, label = self._dataset[index]
    except Exception as exc:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Failed to retrieve item at index {index}"
        ) from exc

    sample = {}
    sample["data"] = image
    sample["target"] = label
    return sample

MedicalFolderController

MedicalFolderController(root, tabular_file=None, index_col=None, dlp=None, validate=True)

Bases: Controller

Parameters:

Name Type Description Default
root Union[str, Path]

Root directory path

required
tabular_file Optional[Union[str, PathLike, Path]]

Path to CSV file containing the demographic information

None
index_col Optional[Union[int, str]]

Column name in tabular file containing the subjects names

None

Raises:

Type Description
FedbiomedError

if only one of tabular_file and index_col is given (both or none are expected)

Source code in fedbiomed/common/dataset_controller/_medical_folder_controller.py
def __init__(
    self,
    root: Union[str, Path],
    tabular_file: Optional[Union[str, PathLike, Path]] = None,
    index_col: Optional[Union[int, str]] = None,
    dlp: Optional[DataLoadingPlan] = None,
    validate: bool = True,
) -> None:
    """Constructor for class `MedicalFolder`

    Args:
        root: Root directory path
        tabular_file: Path to CSV file containing the demographic information
        index_col: Column name in tabular file containing the subjects names
        dlp: Data loading plan; when given it is applied through `set_dlp`
        validate: If `True`, `self.validate()` is called at the end of
            the constructor

    Raises:
        FedbiomedError:
        - if one in `tabular_file` and `index_col` is given and the other is not
    """
    DataLoadingPlanMixin.__init__(self)
    self.root = root
    self._tabular_file = self._validate_tabular_file(tabular_file)
    self._index_col = self._validate_index_col(index_col)

    # Folder structure <subject>/<modality>/<file> in DataFrame format
    self._df_dir = self._make_df_dir(root=self.root, extensions=self._extensions)

    # Demographics
    # Both arguments must be given together, or both omitted
    if (tabular_file is None) != (index_col is None):
        raise FedbiomedError(
            f"{ErrorNumbers.FB613.value}: "
            "Arguments `tabular_file` and `index_col`, both or none are expected"
        )
    # Demographics are read from the arguments as passed by the caller
    self._demographics = (
        None
        if tabular_file is None
        else self.read_demographics(tabular_file, index_col)
    )

    # Data loading plan
    if dlp is not None:
        self.set_dlp(dlp)

    # Function 'validate' instantiates self._controller_kwargs
    # Only the exact value `True` triggers the validation
    if validate is True:
        self.validate()

Attributes

demographics property
demographics
df_dir property
df_dir

Returns a copy to ensure _df_dir is not modified

index_col property
index_col
modalities property
modalities

Returns keys of dict that maps modalities to folders

root instance-attribute
root = root
subjects property
subjects
tabular_file property
tabular_file

Functions

available_subjects
available_subjects(subjects_from_index, subjects_from_folder=None)

Checks missing subject folders and missing entries in demographics

Parameters:

Name Type Description Default
subjects_from_index Union[list, Series]

Given subject folder names in demographics

required
subjects_from_folder list

list of subject folder names

None

Returns:

Type Description
Dict[str, str]

Dict with the following keys:

  • missing_folders: subjects in demographics absent in folder structure
  • missing_entries: subjects in folder structure absent in demographics
  • intersection: subjects present in folder structure and demographics
Source code in fedbiomed/common/dataset_controller/_medical_folder_controller.py
def available_subjects(
    self,
    subjects_from_index: Union[list, pd.Series],
    subjects_from_folder: Optional[list] = None,
) -> Dict[str, list]:
    """Checks missing subject folders and missing entries in demographics

    Args:
        subjects_from_index: Given subject folder names in demographics
        subjects_from_folder: list of subject folder names. If `None`,
            `self.subjects` is used.

    Returns:
        Dict of lists (in no particular order) with next keys:
        - missing_folders: subjects in demographics absent in folder structure
        - missing_entries: subjects in folder structure absent in demographics
        - intersection: subjects present in folder structure and demographics
    """
    # Select all subject folders if it is not given
    if subjects_from_folder is None:
        subjects_from_folder = self.subjects

    # Missing subject that will cause warnings
    missing_folders = list(
        set(subjects_from_index).difference(subjects_from_folder)
    )
    # Missing entries that will cause errors
    missing_entries = list(
        set(subjects_from_folder).difference(subjects_from_index)
    )
    # Subjects available on both sides
    intersection = list(
        set(subjects_from_index).intersection(subjects_from_folder)
    )

    return {
        "missing_folders": missing_folders,
        "missing_entries": missing_entries,
        "intersection": intersection,
    }
demographics_column_names
demographics_column_names(path)
Source code in fedbiomed/common/dataset_controller/_medical_folder_controller.py
def demographics_column_names(self, path: Union[str, Path]):
    """Get the column names of a demographics tabular file

    Args:
        path: Path to the demographics file, passed to `read_demographics`

    Returns:
        Column labels of the loaded demographics (`columns.values`)
    """
    demographics = self.read_demographics(path)
    column_labels = demographics.columns
    return column_labels.values
get_dataset_type staticmethod
get_dataset_type()
Source code in fedbiomed/common/dataset_controller/_medical_folder_controller.py
@staticmethod
def get_dataset_type() -> DatasetTypes:
    """Get the dataset type associated with this controller

    Returns:
        `DatasetTypes.MEDICAL_FOLDER`
    """
    return DatasetTypes.MEDICAL_FOLDER
get_sample
get_sample(index)

Retrieve a data sample without applying transforms

Source code in fedbiomed/common/dataset_controller/_medical_folder_controller.py
def get_sample(
    self, index: int
) -> Dict[str, NiftiReader.data_type | Dict[str, Any]]:
    """Retrieve a data sample without applying transforms

    Runs `validate` first when the controller is not validated yet.

    Args:
        index: Position of the sample in `self._samples`

    Returns:
        `dict` with one entry per modality, read with `NiftiReader`, plus
        the `"demographics"` entry of the sample when it has one

    Raises:
        FedbiomedError: if reading one of the modalities fails
    """
    if self._validated is False:
        self.validate()

    entry = self._samples[index]
    try:
        loaded = {}
        for modality in self.modalities:
            loaded[modality] = NiftiReader.read(entry[modality])
    except Exception as exc:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Failed to retrieve item at index {index}"
        ) from exc

    # Demographics are copied as is, they are not read from a file here
    if "demographics" in entry:
        loaded["demographics"] = entry["demographics"]
    return loaded
read_demographics staticmethod
read_demographics(tabular_file, index_col=None)

Read demographics tabular file

Parameters:

Name Type Description Default
tabular_file Union[str, Path]

path to demographics file

required
index_col Optional[Union[int, str]]

Index column that matches <subject>. Defaults to None.

None

Raises:

Type Description
FedbiomedError

if the file can not be loaded

Returns:

Type Description
DataFrame

Demographics in DataFrame format

Source code in fedbiomed/common/dataset_controller/_medical_folder_controller.py
@staticmethod
def read_demographics(
    tabular_file: Union[str, Path],
    index_col: Optional[Union[int, str]] = None,
) -> pd.DataFrame:
    """Read demographics tabular file

    When `index_col` is given it becomes the index of the result, and only
    the first row is kept for each duplicated index value.

    Args:
        tabular_file: path to demographics file
        index_col: Index column that matches <subject>, given either as a
            column name or as a column position. Defaults to None.

    Raises:
        FedbiomedError: if the file can not be loaded, or if an integer
            `index_col` is out of bounds

    Returns:
        Demographics in DataFrame format
    """
    tabular_file = MedicalFolderController._validate_tabular_file(tabular_file)

    try:
        demographics = CsvReader(tabular_file).data.to_pandas()
        if index_col is not None:
            if isinstance(index_col, int):
                # Positional index: check bounds, then translate to column name
                if index_col < 0 or index_col >= len(demographics.columns):
                    raise FedbiomedError(
                        f"{ErrorNumbers.FB613.value}: "
                        f"Index column {index_col} is out of bounds"
                    )
                index_col = demographics.columns[index_col]
            demographics = demographics.set_index(index_col)

    except FedbiomedError as e:
        # Also wraps the out-of-bounds error raised just above
        raise FedbiomedError(
            f"{ErrorNumbers.FB613.value}: "
            f"Can not load demographics tabular file. Error message is: {e}"
        ) from e

    length = len(demographics)
    logger.info(f"Number of rows in demographics file: {length}")

    # Keep the first one in duplicated subjects
    demographics = demographics.loc[~demographics.index.duplicated(keep="first")]
    if length != len(demographics):
        logger.info(f"Length of demographics for unique index {len(demographics)}")

    return demographics
set_dlp
set_dlp(dlp)

Ensures validation of controller object after set_dlp is executed

Source code in fedbiomed/common/dataset_controller/_medical_folder_controller.py
def set_dlp(self, dlp):
    """Ensures validation of controller object after set_dlp is executed

    Args:
        dlp: Data loading plan, passed to `DataLoadingPlanMixin.set_dlp`
    """
    DataLoadingPlanMixin.set_dlp(self, dlp)
    # Validate only once the plan is set, so `validate` runs with the new plan
    self.validate()
subject_modality_status
subject_modality_status(index=None)

Scans subjects and checks which modalities exist for each subject

Parameters:

Name Type Description Default
index Union[list, Series]

Array-like index that comes from reference csv file. It represents subject folder names. Defaults to None.

None

Returns: Modality status that indicates which modalities are available per subject

Source code in fedbiomed/common/dataset_controller/_medical_folder_controller.py
def subject_modality_status(
    self, index: Optional[Union[list, pd.Series]] = None
) -> Dict:
    """Scans subjects and checks which modalities exist for each subject

    Args:
        index: Array-like index that comes from reference csv file.
            It represents subject folder names. Defaults to None.
    Returns:
        Modality status that indicates which modalities are available per subject,
        as a `dict` with keys `"columns"`, `"data"` and `"index"`. When `index`
        is given, columns `in_folder` and `in_index` are added.
    """
    # Pivot into wide format with boolean indicators
    # One row per subject, one column per modality; `val` is only a marker
    # column, and combinations filled with 0 become `False` after `astype`
    df_ = (
        self.df_dir.assign(val=True)
        .pivot_table(
            index="subject",
            columns="modality",
            values="val",
            fill_value=0,
        )
        .astype(bool)
    )

    if index is not None:
        # Every row of the pivot comes from the folder structure
        df_["in_folder"] = True
        # Merge with pivot (outer join)
        # Subjects found on only one side are kept, with missing values
        df_ = pd.merge(
            df_,
            pd.DataFrame(True, index=index, columns=["in_index"]),
            left_index=True,
            right_index=True,
            how="outer",
        )
        # Fill missing values with False
        df_ = df_.fillna(False)

    # Plain lists describing the table: column labels, row values, row labels
    return {
        "columns": df_.columns.tolist(),
        "data": df_.values.tolist(),
        "index": df_.index.tolist(),
    }
validate
validate()
Source code in fedbiomed/common/dataset_controller/_medical_folder_controller.py
def validate(self) -> None:
    """Build modalities, subjects and samples, then check a sample can be read

    Also rebuilds `self._controller_kwargs` from the current state of the
    controller. Errors raised by the helpers or by `get_sample(0)` are not
    caught here.
    """
    # Filter subjects to contain all modalities
    self._modalities, df_dir = self._prepare_df_dir_for_use(self.df_dir, self._dlp)
    # Generate list of samples: dict, with demographics and path to modalities
    self._subjects, self._samples = self._make_dataset(self.demographics, df_dir)
    # Change flag to be able to recover a sample
    # `get_sample` calls `validate` while the flag is `False`, so the flag is
    # set before the call below
    self._validated = True
    # Check if is possible to use `reader` to recover a valid item
    _ = self.get_sample(0)

    # Arguments recorded for this controller, with paths stored as strings
    self._controller_kwargs = {
        "root": str(self.root),
        "tabular_file": (
            None if self.tabular_file is None else str(self.tabular_file)
        ),
        "index_col": self.index_col,
        "dlp": self._dlp,
    }

MedicalFolderLoadingBlockTypes

MedicalFolderLoadingBlockTypes(*args)

Bases: DataLoadingBlockTypes, Enum

Source code in fedbiomed/common/constants.py
def __init__(self, *args):
    """Check the value of the member being initialized

    Args:
        *args: Not used

    Raises:
        ValueError: if the value of the member is not a `str`, or if a member
            of the class already has the same value
    """
    enum_cls = self.__class__
    if not isinstance(self.value, str):
        raise ValueError(
            "all fields of DataLoadingBlockTypes subclasses must be of str type"
        )

    for member in enum_cls:
        if self.value != member.value:
            continue
        # Report both names: the one being checked and the one found by value
        current_name = self.name
        existing_name = enum_cls(self.value).name
        raise ValueError(
            f"duplicate values not allowed in DataLoadingBlockTypes and "
            f"its subclasses: {current_name} --> {existing_name}"
        )

Attributes

MODALITIES_TO_FOLDERS class-attribute instance-attribute
MODALITIES_TO_FOLDERS = 'modalities_to_folders'

MnistController

MnistController(root, train=True, download=True)

Bases: Controller

Generic Mnist controller where the data is arranged in this way:

root
└── MNIST
    └── raw
        ├── train-images-idx3-ubyte
        ├── train-labels-idx1-ubyte
        ├── t10k-images-idx3-ubyte
        └── t10k-labels-idx1-ubyte

Parameters:

Name Type Description Default
root Union[str, Path]

Root directory path

required
train bool

If true then train files are used

True
download bool

If true then downloads and extracts the files if they do not exist

True

Raises:

Type Description
FedbiomedError

if torchvision.datasets.MNIST can not be initialized

Source code in fedbiomed/common/dataset_controller/_mnist_controller.py
def __init__(
    self,
    root: Union[str, Path],
    train: bool = True,
    download: bool = True,
) -> None:
    """Constructor of the class

    Args:
        root: Root directory path
        train: If true then train files are used
        download: If true then downloads and extracts the files if they do not exist

    Raises:
        FedbiomedError: if `torchvision.datasets.MNIST` can not be initialized
    """
    self.root = root

    try:
        self._dataset = MNIST(root=self.root, train=train, download=download)
    except Exception as e:
        # Second fragment must be an f-string too, so that the original
        # error text is included instead of a literal "{e}"
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: "
            f"Failed to instantiate MnistDataset object. {e}"
        ) from e

    # `download` is always recorded as `False`, whatever value was received
    self._controller_kwargs = {
        "root": str(self.root),
        "train": train,
        "download": False,
    }

Attributes

root instance-attribute
root = root

Functions

get_sample
get_sample(index)

Retrieve a data sample without applying transforms

Source code in fedbiomed/common/dataset_controller/_mnist_controller.py
def get_sample(self, index: int) -> Dict[str, Any]:
    """Retrieve a data sample without applying transforms

    Args:
        index: Position of the item in `self._dataset`

    Returns:
        `dict` with the item under `"data"` and its target under `"target"`

    Raises:
        FedbiomedError: if the item can not be retrieved
    """
    try:
        image, label = self._dataset[index]
    except Exception as exc:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Failed to retrieve item at index {index}"
        ) from exc
    else:
        return dict(data=image, target=label)

TabularController

TabularController(root)

Bases: Controller

Parameters:

Name Type Description Default
root Union[str, Path]

Root directory path

required

Raises:

Type Description
FedbiomedError

if root does not exist

Source code in fedbiomed/common/dataset_controller/_tabular_controller.py
def __init__(
    self,
    root: Union[str, Path],
) -> None:
    """Constructor of the class

    Creates the `CsvReader` used by the other methods of the controller.

    Args:
        root: Root directory path, given to `CsvReader`

    Raises:
        FedbiomedError: if `root` does not exist
    """
    self.root = root
    self._reader = CsvReader(self.root)
    # Arguments recorded for this controller, with `root` stored as a string
    self._controller_kwargs = dict(root=str(self.root))

Attributes

root instance-attribute
root = root

Functions

get_sample
get_sample(index)

Retrieve a data sample without applying transforms

Source code in fedbiomed/common/dataset_controller/_tabular_controller.py
def get_sample(self, index: int) -> pl.DataFrame:
    """Retrieve a data sample without applying transforms

    Args:
        index: Index of the sample, passed to the reader

    Returns:
        Value returned by the reader for `index`

    Raises:
        FedbiomedError: if `index` is not lower than the length of the controller
    """
    size = self.__len__()
    # Only the upper bound is checked here
    if index >= size:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Failed to retrieve item at index {index}"
        )
    reader = self._reader
    return reader.get(index)
get_types
get_types()

Get dtypes of the columns in the Tabular dataset

Source code in fedbiomed/common/dataset_controller/_tabular_controller.py
def get_types(self):
    """Get dtypes of the columns in the Tabular dataset

    Returns:
        Mapping from each column of the reader schema to the class name of
        its dtype
    """
    schema = self._reader.data.schema
    dtype_names = {}
    for column, dtype in schema.items():
        dtype_names[column] = dtype.__class__.__name__
    return dtype_names
normalize_columns
normalize_columns(columns)

Validate and normalize columns to a list of column names

Parameters:

Name Type Description Default
columns Union[Iterable, int, str]

Columns to normalize

required

Raises:

Type Description
FedbiomedError

if columns is not valid

Returns:

Type Description
list[str]

List of column names

Source code in fedbiomed/common/dataset_controller/_tabular_controller.py
def normalize_columns(self, columns: Union[Iterable, int, str]) -> list[str]:
    """Validate and normalize `columns` to a list of column names

    The work is delegated to `normalize_columns` of the reader.

    Args:
        columns: Columns to normalize

    Raises:
        FedbiomedError: if `columns` is not valid

    Returns:
        List of column names
    """
    reader = self._reader
    normalized = reader.normalize_columns(columns=columns)
    return normalized
shape
shape()
Source code in fedbiomed/common/dataset_controller/_tabular_controller.py
def shape(self) -> Dict:
    """Get shape of the Tabular dataset

    Returns:
        Value returned by `shape` of the reader
    """
    reader = self._reader
    return reader.shape()