Dataset

Classes that simplify imports from fedbiomed.common.dataset

Attributes

DATASET_CLASSES_PER_TYPE module-attribute

DATASET_CLASSES_PER_TYPE = {CUSTOM: CustomDataset, IMAGES: ImageFolderDataset, MEDICAL_FOLDER: MedicalFolderDataset, MEDNIST: MedNistDataset, DEFAULT: MnistDataset, TABULAR: TabularDataset}

REGISTRY_CONTROLLERS module-attribute

REGISTRY_CONTROLLERS = {TABULAR: (TabularController, ControllerParametersBase, DATASET_CLASSES_PER_TYPE[TABULAR]), MEDICAL_FOLDER: (MedicalFolderController, MedicalFolderParameters, DATASET_CLASSES_PER_TYPE[MEDICAL_FOLDER]), IMAGES: (ImageFolderController, ControllerParametersBase, DATASET_CLASSES_PER_TYPE[IMAGES]), DEFAULT: (MnistController, ControllerParametersBase, DATASET_CLASSES_PER_TYPE[DEFAULT]), MEDNIST: (MedNistController, ControllerParametersBase, DATASET_CLASSES_PER_TYPE[MEDNIST]), CUSTOM: (CustomController, ControllerParametersBase, DATASET_CLASSES_PER_TYPE[CUSTOM])}

Classes

ControllerParametersBase dataclass

ControllerParametersBase(root)

Attributes

root instance-attribute
root

Functions

from_dict classmethod
from_dict(data)
Source code in fedbiomed/common/dataset/_mappings.py
@classmethod
def from_dict(cls, data: dict):
    """Build an instance of ``cls`` from a dictionary.

    Keys of ``data`` that do not match a dataclass field of ``cls`` are
    ignored; the remaining entries are passed as keyword arguments.

    Args:
        data: mapping of candidate field names to values.

    Returns:
        A new instance of ``cls``.
    """
    known_names = {spec.name for spec in fields(cls)}
    kwargs = {key: data[key] for key in data if key in known_names}
    return cls(**kwargs)
to_dict
to_dict()

Convert entry to dictionary - removes None values

Source code in fedbiomed/common/dataset/_mappings.py
def to_dict(self) -> dict:
    """Convert entry to dictionary - removes None values

    Returns:
        The dataclass fields as a dict, without the entries whose value is None.
    """
    as_mapping = asdict(self)
    return {name: value for name, value in as_mapping.items() if value is not None}

CustomDataset

Bases: Dataset

A class representing a custom dataset.

This class allows users to create and manage their own datasets for use in federated learning scenarios.

Functions

complete_initialization
complete_initialization(controller_kwargs, to_format)

Finalize initialization of object to be able to recover items.

Parameters:

Name Type Description Default
controller_kwargs Dict[str, Any]

must contain a "root" key with the path to the dataset.

required
to_format DataReturnFormat

expected format of data returned by __getitem__.

required
Source code in fedbiomed/common/dataset/_custom_dataset.py
def complete_initialization(
    self, controller_kwargs: Dict[str, Any], to_format: DataReturnFormat
) -> None:
    """Finalize initialization of object to be able to recover items.

    Stores the dataset location and the return format, calls the user-defined
    ``read`` method, then probes item 0 through ``get_item`` to check that the
    dataset is not empty and yields a ``(data, target)`` tuple whose elements
    pass ``_check_type``.

    Args:
        controller_kwargs: must contain a ``"root"`` key with the path to the dataset.
        to_format: expected format of data returned by ``__getitem__``.

    Raises:
        FedbiomedError: if ``"root"`` is missing, if ``read`` or ``get_item``
            fails, if the dataset is empty, or if ``get_item`` does not return
            a tuple of two elements.
    """
    root = controller_kwargs.get("root")
    self.path = root
    if root is None:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Custom Dataset ERROR: 'root' must be provided in controller_kwargs to specify dataset location."
        )
    self._to_format = to_format

    # Let the user-defined reader load the dataset
    try:
        self.read()
    except Exception as exc:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Failed to read "
            f"from dataset using read method. Please see error: {exc}"
        ) from exc

    if not len(self):
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Custom Dataset ERROR: dataset is empty (len == 0)."
        )

    # Probe the first item to validate the user-defined accessor
    try:
        first = self.get_item(0)
    except Exception as exc:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Failed to retrieve item "
            f"from dataset using get_item method. Please see error: {exc}"
        ) from exc

    is_pair = isinstance(first, tuple) and len(first) == 2
    if not is_pair:
        size = len(first) if isinstance(first, (list, tuple)) else "N/A"
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: get_item method must return a tuple of two elements"
            f" (data, target), but got {type(first).__name__} with"
            f" length {size}"
        )

    for element, label in zip(first, ("data", "target")):
        self._check_type(element, label)
get_item abstractmethod
get_item(index)

Return a (data, target) tuple for the given index.

Parameters:

Name Type Description Default
index int

Index of the sample to retrieve.

required
Source code in fedbiomed/common/dataset/_custom_dataset.py
@abstractmethod
def get_item(self, index):
    """Return a (data, target) tuple for the given index.

    Abstract hook: subclasses provide the actual implementation; this base
    version does nothing and returns None.

    Args:
        index (int): Index of the sample to retrieve.
    """
    return None
read abstractmethod
read()

Reads the dataset from the specified path.

This method should be implemented by subclasses to load the dataset from the given path and prepare it for use.

Source code in fedbiomed/common/dataset/_custom_dataset.py
@abstractmethod
def read(self) -> None:
    """Reads the dataset from the specified path.

    Abstract hook: subclasses implement it to load the dataset from the
    given path and prepare it for use. This base version does nothing.
    """
    return None

Dataset

Bases: ABC

Attributes

target_transform class-attribute instance-attribute
target_transform = None
to_format property writable
to_format
transform class-attribute instance-attribute
transform = None

Functions

apply_transforms
apply_transforms(sample)

Apply transforms to sample in place

Parameters:

Name Type Description Default
sample Dict[str, Any]

sample returned by self._controller.get_sample

required

Raises:

Type Description
FedbiomedError

if there is a problem applying transform or target_transform

Source code in fedbiomed/common/dataset/_dataset.py
def apply_transforms(self, sample: Dict[str, Any]) -> Dict[str, Any]:
    """Apply transforms to sample in place

    The `data` entry is passed through the format-conversion callable, the
    default-types callable and `self._transform`; the result is then passed
    through the default-types callable once more. The `target` entry gets the
    same treatment with `self._target_transform`, but only when it is present
    and not None.

    Args:
        sample: sample returned by `self._controller.get_sample`

    Returns:
        The same `sample` dict, with `data` (and `target`, if any) replaced.

    Raises:
        FedbiomedError: if there is a problem applying `transform` or `target_transform`,
            or applying the default types to the transformed values
    """
    # Step 1: convert format, apply default types, then the user transform
    try:
        sample["data"] = self._transform(
            self._get_default_types_callable()(
                self._get_format_conversion_callable()(sample["data"])
            )
        )
    except Exception as e:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Failed to apply `transform` to `data` "
            f"in sample in {self._to_format.value} format."
        ) from e

    # Step 2: apply default types again, this time on the transform's output
    try:
        sample["data"] = self._get_default_types_callable()(sample["data"])
    except Exception as e:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Failed to apply default training plan types to `data` "
            f"in sample in {self._to_format.value} format."
        ) from e

    # Target is optional: a missing key or a None value skips both steps
    if sample.get("target") is not None:
        try:
            sample["target"] = self._target_transform(
                self._get_default_types_callable()(
                    self._get_format_conversion_callable()(sample["target"])
                )
            )
        except Exception as e:
            raise FedbiomedError(
                f"{ErrorNumbers.FB632.value}: Failed to apply `target_transform` to "
                f"`target` in sample in {self._to_format.value} format."
            ) from e

        try:
            sample["target"] = self._get_default_types_callable()(sample["target"])
        except Exception as e:
            raise FedbiomedError(
                f"{ErrorNumbers.FB632.value}: Failed to apply default training plan types to `target` "
                f"in sample in {self._to_format.value} format."
            ) from e

    return sample
complete_initialization abstractmethod
complete_initialization(controller_kwargs, to_format)

Finalize initialization of object to be able to recover items

Source code in fedbiomed/common/dataset/_dataset.py
@abstractmethod
def complete_initialization(
    self,
    controller_kwargs: Dict[str, Any],
    to_format: DataReturnFormat,
) -> None:
    """Finalize initialization of object to be able to recover items

    Abstract hook: this base version does nothing. Implementations are
    expected to recover a sample and validate consistency of transforms.

    Args:
        controller_kwargs: arguments handed to the concrete implementation.
        to_format: format associated to expected return format.
    """
    return None
compute_stats
compute_stats(dataset_schema=None, stats=None, stats_args=None)

Computes statistics over the dataset using the AnalyticsOrchestrator.

Parameters:

Name Type Description Default
dataset_schema Optional[Union[str, List[str], Dict[str, Any]]]

Selection arguments to filter the schema (e.g. subset of columns/keys).

None
stats Optional[List[str]]

List of statistics names to compute (e.g. ['mean', 'std']). If None or empty, default statistics are chosen based on data type.

None
stats_args Optional[Dict[str, Any]]

Specific arguments for statistics, structured matching the schema.

None

Returns:

Type Description
Any

Computed statistics structure.

Raises:

Type Description
FedbiomedError

If the dataset does not support analytics (missing get_schema_for_analytics).

Source code in fedbiomed/common/dataset/_dataset.py
def compute_stats(
    self,
    dataset_schema: Optional[Union[str, List[str], Dict[str, Any]]] = None,
    stats: Optional[List[str]] = None,
    stats_args: Optional[Dict[str, Any]] = None,
) -> Any:
    """Computes statistics over the dataset using the AnalyticsOrchestrator.

    Thin wrapper: a fresh `AnalyticsOrchestrator` is created on every call and
    all arguments are forwarded to its `compute_stats` method unchanged.

    Args:
        dataset_schema: Selection arguments to filter the schema (e.g. subset of columns/keys).
        stats: List of statistics names to compute (e.g. ['mean', 'std']).
               If None or empty, default statistics are chosen based on data type.
        stats_args: Specific arguments for statistics, structured matching the schema.

    Returns:
        Computed statistics structure.

    Raises:
        FedbiomedError: If the dataset does not support analytics (missing get_schema_for_analytics).
            NOTE(review): nothing in this method raises; presumably the
            orchestrator does - confirm the exact hook name it requires.
    """
    orchestrator = AnalyticsOrchestrator()
    # `self` is passed as the dataset to analyse
    return orchestrator.compute_stats(
        self,
        dataset_schema=dataset_schema,
        stats=stats,
        stats_args=stats_args,
    )

ImageFolderDataset

ImageFolderDataset(transform=None, target_transform=None)

Bases: _ImageLabelDataset

Source code in fedbiomed/common/dataset/_image_label_dataset.py
def __init__(
    self,
    transform: Optional[Callable] = None,
    target_transform: Optional[Callable] = None,
):
    """Store the validated transforms of an image/label dataset.

    Args:
        transform: optional callable applied to the data.
        target_transform: optional callable applied to the target.

    Raises:
        FedbiomedValueError: if `_ImageLabelDataset` itself (rather than a
            subclass) is being instantiated.
    """
    # Exact-type check on purpose: subclasses must be allowed through
    if type(self) is _ImageLabelDataset:
        message = (
            f"{ErrorNumbers.FB632.value}: "
            "`_ImageLabelDataset` cannot be instantiated directly"
        )
        raise FedbiomedValueError(message)

    validated_transform = self._validate_transform(transform)
    self._transform = validated_transform
    validated_target_transform = self._validate_transform(target_transform)
    self._target_transform = validated_target_transform

MedNistDataset

MedNistDataset(transform=None, target_transform=None)

Bases: _ImageLabelDataset

Source code in fedbiomed/common/dataset/_image_label_dataset.py
def __init__(
    self,
    transform: Optional[Callable] = None,
    target_transform: Optional[Callable] = None,
):
    """Store the validated transforms of an image/label dataset.

    Args:
        transform: optional callable applied to the data.
        target_transform: optional callable applied to the target.

    Raises:
        FedbiomedValueError: if `_ImageLabelDataset` itself (rather than a
            subclass) is being instantiated.
    """
    # Exact-type check on purpose: subclasses must be allowed through
    if type(self) is _ImageLabelDataset:
        message = (
            f"{ErrorNumbers.FB632.value}: "
            "`_ImageLabelDataset` cannot be instantiated directly"
        )
        raise FedbiomedValueError(message)

    validated_transform = self._validate_transform(transform)
    self._transform = validated_transform
    validated_target_transform = self._validate_transform(target_transform)
    self._target_transform = validated_target_transform

MedicalFolderDataset

MedicalFolderDataset(data_modalities, target_modalities=None, transform=None, target_transform=None)

Bases: Dataset

Parameters:

Name Type Description Default
data_modalities Union[str, Iterable[str]]

The data modalities to use.

required
target_modalities Optional[Union[str, Iterable[str]]]

The target modalities to use.

None
transform Transform

The transform to apply to the data. Defaults to None.

None
target_transform Transform

The transform to apply to the target data. Defaults to None.

None

Raises:

Type Description
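FedbiomedValueError

If the input modalities are not valid, if data_modalities is empty, or if target_transform is given but target_modalities is None.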
Source code in fedbiomed/common/dataset/_medical_folder_dataset.py
def __init__(
    self,
    data_modalities: Union[str, Iterable[str]],
    target_modalities: Optional[Union[str, Iterable[str]]] = None,
    transform: Transform = None,
    target_transform: Transform = None,
):
    """Initializes the MedicalFolderDataset.

    Args:
        data_modalities (Union[str, Iterable[str]]): The data modalities to use.
        target_modalities (Optional[Union[str, Iterable[str]]]): The target modalities to use.
        transform (Transform, optional): The transform to apply to the data. Defaults to None.
        target_transform (Transform, optional): The transform to apply to the target data. Defaults to None.

    Raises:
        FedbiomedValueError: If the input modalities are not valid, if
            `data_modalities` is empty, or if `target_transform` is given
            but `target_modalities` is None.
    """
    if not data_modalities:
        raise FedbiomedValueError(
            f"{ErrorNumbers.FB632.value}: `data_modalities` cannot be empty"
        )

    self._data_modalities = self._normalize_modalities(data_modalities)
    if target_modalities is None:
        self._target_modalities = None
    else:
        self._target_modalities = self._normalize_modalities(target_modalities)

    # Data transform is validated against the normalized data modalities
    self._transform = self._validate_transform(
        transform=transform,
        modalities=self._data_modalities,
    )

    if self._target_modalities is not None:
        self._target_transform = self._validate_transform(
            transform=target_transform,
            modalities=self._target_modalities,
        )
        return

    # No target modalities: a target transform would have nothing to act on
    if target_transform is not None:
        raise FedbiomedValueError(
            f"{ErrorNumbers.FB632.value}: `target_transform` provided but "
            "`target_modalities` is None"
        )
    self._target_transform = None

Attributes

data_modalities property
data_modalities

Returns the data modalities of the dataset.

demographics_columns property
demographics_columns

Returns the columns of the dataset if 'demographics' modality is present, else None.

target_modalities property
target_modalities

Returns the target modalities of the dataset, or None if not defined.

Functions

analytics_schema
analytics_schema()

Return schema associated with federated analytics.

Source code in fedbiomed/common/dataset/_medical_folder_dataset.py
def analytics_schema(self):
    """Return schema associated with federated analytics.

    Returns:
        A `(schema, None)` tuple. `schema` maps `"demographics"` to a
        `RowSpec` (only when `demographics_columns` is not None) and every
        other data modality to an `ImageSpec`.

    Raises:
        FedbiomedError: if the controller has not been set yet.
    """
    if self._controller is None:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Dataset object has not completed "
            "initialization. It is not ready to use yet."
        )

    schema = {}

    # Demographics entry comes first, when columns are available
    if self.demographics_columns is not None:
        schema["demographics"] = RowSpec(columns=self.demographics_columns)

    # Every remaining data modality is described as an image
    for name in self._data_modalities:
        if name == "demographics":
            continue
        schema[name] = ImageSpec()

    return schema, None
complete_initialization
complete_initialization(controller_kwargs, to_format)

Finalize initialization of object to be able to recover items

Parameters:

Name Type Description Default
controller_kwargs Dict[str, Any]

arguments to create controller

required
to_format DataReturnFormat

format associated to expected return format

required
Source code in fedbiomed/common/dataset/_medical_folder_dataset.py
def complete_initialization(
    self,
    controller_kwargs: Dict[str, Any],
    to_format: DataReturnFormat,
) -> None:
    """Finalize initialization of object to be able to recover items

    Sets the return format, creates the controller, then fetches sample 0 to
    validate the format and transforms of the data modalities and, when
    defined, of the target modalities.

    Args:
        controller_kwargs: arguments to create controller
        to_format: format associated to expected return format
    """
    self.to_format = to_format
    self._init_controller(controller_kwargs=controller_kwargs)

    # A single probe sample is used for both validations
    first_sample = self._controller.get_sample(0)

    data_part = {name: first_sample[name] for name in self._data_modalities}
    self._validate_format_and_transformations(
        data_part,
        transform=self._transform,
    )

    if self._target_modalities is None:
        return

    target_part = {name: first_sample[name] for name in self._target_modalities}
    self._validate_format_and_transformations(
        target_part,
        transform=self._target_transform,
        is_target=True,
    )

MedicalFolderParameters dataclass

MedicalFolderParameters(root, tabular_file=None, index_col=None, dlp=None)

Bases: ControllerParametersBase

Attributes

dlp class-attribute instance-attribute
dlp = None
index_col class-attribute instance-attribute
index_col = None
tabular_file class-attribute instance-attribute
tabular_file = None

MnistDataset

MnistDataset(transform=None, target_transform=None)

Bases: _ImageLabelDataset

Source code in fedbiomed/common/dataset/_image_label_dataset.py
def __init__(
    self,
    transform: Optional[Callable] = None,
    target_transform: Optional[Callable] = None,
):
    """Store the validated transforms of an image/label dataset.

    Args:
        transform: optional callable applied to the data.
        target_transform: optional callable applied to the target.

    Raises:
        FedbiomedValueError: if `_ImageLabelDataset` itself (rather than a
            subclass) is being instantiated.
    """
    # Exact-type check on purpose: subclasses must be allowed through
    if type(self) is _ImageLabelDataset:
        message = (
            f"{ErrorNumbers.FB632.value}: "
            "`_ImageLabelDataset` cannot be instantiated directly"
        )
        raise FedbiomedValueError(message)

    validated_transform = self._validate_transform(transform)
    self._transform = validated_transform
    validated_target_transform = self._validate_transform(target_transform)
    self._target_transform = validated_target_transform

NativeDataset

NativeDataset(dataset, target=None)

Bases: Dataset

A class representing a native dataset.

This class wraps around datasets from popular ML libraries like PyTorch and scikit-learn, allowing them to be used seamlessly in a customized TrainingPlan for FedBiomed.

Parameters:

Name Type Description Default
dataset

Native dataset object from a ML library (e.g., PyTorch, scikit-learn).

required
target Optional[Any]

Optional target data if not included in the dataset.

None

Raises: FedbiomedError: if dataset does not implement collection interface, or if target length does not match dataset length, or if both dataset and argument provide targets.

Source code in fedbiomed/common/dataset/_native_dataset.py
def __init__(self, dataset, target: Optional[Any] = None):
    """Initialize with basic checks, without loading data to memory.

    Item 0 of `dataset` is fetched once: the dataset is treated as supervised
    when that item is a tuple.

    Args:
        dataset: Native dataset object from a ML library (e.g., PyTorch, scikit-learn).
        target: Optional target data if not included in the dataset.
    Raises:
        FedbiomedError: if dataset does not implement collection interface,
            or if item 0 cannot be retrieved,
            or if target length does not match dataset length,
            or if both dataset and argument provide targets.
    """
    # The dataset must behave like a sized, indexable collection
    if not (hasattr(dataset, "__len__") and hasattr(dataset, "__getitem__")):
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Dataset must implement __len__ and __getitem__."
        )

    self._dataset = dataset

    # One probe item tells supervised (tuple) from unsupervised datasets
    try:
        first = dataset[0]
    except Exception as exc:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Failed to get a sample item from dataset. Details: {exc}"
        ) from exc

    self._is_supervised = isinstance(first, tuple)

    if target is not None:
        # Targets may come from the dataset or from the argument, not both
        if self._is_supervised:
            raise FedbiomedError(
                f"{ErrorNumbers.FB632.value}: Target found both in dataset and in 'target' argument."
            )

        # Length is only compared when the target is sized
        if hasattr(target, "__len__") and len(target) != len(dataset):
            raise FedbiomedError(
                f"{ErrorNumbers.FB632.value}: Length of target ({len(target)}) does not match dataset ({len(dataset)})."
            )

    self._target = target  # may be None

Functions

complete_initialization
complete_initialization(controller_kwargs, to_format)

Select data and target, and check if they can be converted to requested format.

Parameters:

Name Type Description Default
controller_kwargs Dict[str, Any]

keyword arguments for controller (not used here).

required
to_format DataReturnFormat

format associated to expected return format.

required

Raises: FedbiomedError: if there is a problem converting dataset items to requested format.

Source code in fedbiomed/common/dataset/_native_dataset.py
def complete_initialization(
    self,
    controller_kwargs: Dict[str, Any],
    to_format: DataReturnFormat,
) -> None:
    """Select data and target, and check if they can be converted to requested format.

    Args:
        controller_kwargs: keyword arguments for controller (not used here).
        to_format: format associated to expected return format.
    Raises:
        FedbiomedError: if there is a problem converting dataset items to requested format.
    """
    self._to_format = to_format
    self._converter = self._get_format_conversion_callable()

    first = self._dataset[0]
    if self._is_supervised:
        # NOTE(review): assumes supervised items are 2-tuples; any other
        # tuple length fails here with a plain unpacking error.
        data, target = first
    else:
        data = first
        target = None if self._target is None else self._target[0]

    # Data is always checked; target only when there is one
    to_check = [data] if target is None else [data, target]
    for item in to_check:
        try:
            self._validate_format_conversion(item)
        except FedbiomedError as exc:
            raise FedbiomedError(
                f"{ErrorNumbers.FB632.value}: Failed to convert dataset items to "
                f"requested format {to_format}. Details: {exc}"
            ) from exc

TabularDataset

TabularDataset(input_columns, target_columns=None, transform=None, target_transform=None)

Bases: Dataset

Parameters:

Name Type Description Default
input_columns Iterable | int | str

Columns to be used as input features

required
target_columns Optional[Iterable | int | str]

Columns to be used as target

None
transform Optional[Callable]

Transformation to be applied to input features

None
target_transform Optional[Callable]

Transformation to be applied to target

None

Raises: FedbiomedValueError: if input_columns or target_columns are not valid, or if transform or target_transform are not valid callables.

Source code in fedbiomed/common/dataset/_tabular_dataset.py
def __init__(
    self,
    input_columns: Iterable | int | str,
    target_columns: Optional[Iterable | int | str] = None,
    transform: Optional[Callable] = None,
    target_transform: Optional[Callable] = None,
) -> None:
    """Constructor of the class

    Only the transforms are validated here; the column selections are stored
    as given and checked later, in `complete_initialization`.

    Args:
        input_columns: Columns to be used as input features
        target_columns: Columns to be used as target
        transform: Transformation to be applied to input features
        target_transform: Transformation to be applied to target
    Raises:
        FedbiomedValueError: if `input_columns` or `target_columns` are not valid
        FedbiomedValueError: if `transform` or `target_transform` are not valid callables
    """
    # Transforms first, so an invalid callable is rejected before any state is kept
    checked_transform = self._validate_transform(transform=transform)
    self._transform = checked_transform
    checked_target_transform = self._validate_transform(transform=target_transform)
    self._target_transform = checked_target_transform

    # Column validation needs the controller's reader, which does not exist
    # yet: it is deferred to complete_initialization
    self._input_columns = input_columns
    self._target_columns = target_columns

Functions

analytics_schema
analytics_schema()

Return schema for federated analytics

Source code in fedbiomed/common/dataset/_tabular_dataset.py
def analytics_schema(self):
    """Return schema for federated analytics

    Returns:
        A `(RowSpec, None)` tuple, where the `RowSpec` is built from the
        input columns of the dataset.
    """
    row_spec = RowSpec(columns=self._input_columns)
    return row_spec, None
complete_initialization
complete_initialization(controller_kwargs, to_format)

Finalize initialization of object to be able to recover items

Parameters:

Name Type Description Default
controller_kwargs Dict[str, Any]

arguments to create controller

required
to_format DataReturnFormat

format associated to expected return format

required
Source code in fedbiomed/common/dataset/_tabular_dataset.py
def complete_initialization(
    self,
    controller_kwargs: Dict[str, Any],
    to_format: DataReturnFormat,
) -> None:
    """Finalize initialization of object to be able to recover items

    Sets the return format, creates the controller, normalizes (and thereby
    validates) the column selections, then fetches sample 0 to validate the
    format and transforms of the input columns and, when defined, of the
    target columns.

    Args:
        controller_kwargs: arguments to create controller
        to_format: format associated to expected return format

    Raises:
        FedbiomedError: if the sample obtained from the controller has more
            than one row.
    """
    self.to_format = to_format

    self._init_controller(controller_kwargs=controller_kwargs)

    # Normalize columns using controller (implies validation)
    self._input_columns = self._controller.normalize_columns(self._input_columns)
    if self._target_columns is not None:
        self._target_columns = self._controller.normalize_columns(
            self._target_columns
        )
        # Overlap between inputs and targets is allowed, but worth a warning
        _intersection_cols = list(
            set(self._input_columns) & set(self._target_columns)
        )
        if _intersection_cols:
            logger.warning(
                f"Columns {_intersection_cols} are present in both input_columns and target_columns."
            )

    sample = self._controller.get_sample(0)  # type: ignore

    n_rows, _ = sample.shape
    if n_rows > 1:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: TabularDataset currently only supports "
            "row-wise samples. Sample obtained from controller has multiple rows."
        )

    self._validate_format_and_transformations(
        self._get_item_from_sample(sample, self._input_columns),
        transform=self._transform,
    )
    if self._target_columns is not None:
        # Targets must be checked against the target transform: checking them
        # against the input transform would let a broken `target_transform`
        # through and could reject valid targets.
        # NOTE(review): MedicalFolderDataset also passes `is_target=True` in
        # the equivalent call - confirm whether it is needed here as well.
        self._validate_format_and_transformations(
            self._get_item_from_sample(sample, self._target_columns),
            transform=self._target_transform,
        )

Functions

get_controller

get_controller(data_type, controller_parameters)

Get controller instance based on data_type and controller_parameters

Source code in fedbiomed/common/dataset/_mappings.py
def get_controller(
    data_type: str,
    controller_parameters: dict,
) -> Controller:
    """Get controller instance based on data_type and controller_parameters

    The entry registered for `data_type` in `REGISTRY_CONTROLLERS` provides
    the controller class and its parameters class. The parameters are parsed
    with `from_dict` and handed to the controller as keyword arguments taken
    from `to_dict`.

    Args:
        data_type: value identifying the dataset type.
        controller_parameters: raw parameters used to build the controller.

    Returns:
        The instantiated controller.

    Raises:
        FedbiomedError: if `data_type` is unknown, if the parameters cannot
            be parsed, or if the controller cannot be created.
    """
    # Only registered dataset types are accepted
    resolved_type = DatasetTypes.get_type_by_value(data_type)
    if not (resolved_type and resolved_type in REGISTRY_CONTROLLERS):
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: "
            f"Unknown 'data_type', implemented are: {list(REGISTRY_CONTROLLERS.keys())}"
        )

    controller_cls, parameters_cls, _dataset_cls = REGISTRY_CONTROLLERS[resolved_type]

    # Parse the raw parameters into the matching parameters dataclass
    try:
        parsed = parameters_cls.from_dict(controller_parameters)
    except Exception as exc:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Failed to parse dataset_parameters: {str(exc)}"
        ) from exc

    # Project errors pass through untouched; anything else is wrapped
    try:
        controller = controller_cls(**parsed.to_dict())
    except FedbiomedError:
        raise
    except Exception as exc:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Unhandled exception occurred: {str(exc)}"
        ) from exc
    return controller