Classes that simplify imports from fedbiomed.common.dataset
Attributes
DATASET_CLASSES_PER_TYPE module-attribute
DATASET_CLASSES_PER_TYPE = {CUSTOM: CustomDataset, IMAGES: ImageFolderDataset, MEDICAL_FOLDER: MedicalFolderDataset, MEDNIST: MedNistDataset, DEFAULT: MnistDataset, TABULAR: TabularDataset}
REGISTRY_CONTROLLERS module-attribute
REGISTRY_CONTROLLERS = {TABULAR: (TabularController, ControllerParametersBase, DATASET_CLASSES_PER_TYPE[TABULAR]), MEDICAL_FOLDER: (MedicalFolderController, MedicalFolderParameters, DATASET_CLASSES_PER_TYPE[MEDICAL_FOLDER]), IMAGES: (ImageFolderController, ControllerParametersBase, DATASET_CLASSES_PER_TYPE[IMAGES]), DEFAULT: (MnistController, ControllerParametersBase, DATASET_CLASSES_PER_TYPE[DEFAULT]), MEDNIST: (MedNistController, ControllerParametersBase, DATASET_CLASSES_PER_TYPE[MEDNIST]), CUSTOM: (CustomController, ControllerParametersBase, DATASET_CLASSES_PER_TYPE[CUSTOM])}
Classes
ControllerParametersBase dataclass
ControllerParametersBase(root)
Attributes
root instance-attribute
root
Functions
from_dict classmethod
from_dict(data)
Source code in fedbiomed/common/dataset/_mappings.py
@classmethod
def from_dict(cls, data: dict):
    """Build an instance from ``data``, ignoring keys that are not dataclass fields.

    Args:
        data: mapping of candidate constructor arguments. Entries whose key
            does not match the name of a field of ``cls`` are dropped.

    Returns:
        A new instance of ``cls`` created from the retained entries.
    """
    known_fields = set()
    for spec in fields(cls):
        known_fields.add(spec.name)

    kwargs = {}
    for key, value in data.items():
        if key in known_fields:
            kwargs[key] = value
    return cls(**kwargs)
to_dict
to_dict()
Convert entry to dictionary - removes None values
Source code in fedbiomed/common/dataset/_mappings.py
def to_dict(self) -> dict:
    """Convert entry to dictionary - removes None values

    Returns:
        The ``asdict`` representation of the entry, without the items whose
        value is None.
    """
    compact = {}
    for name, value in asdict(self).items():
        if value is None:
            continue
        compact[name] = value
    return compact
CustomDataset
Bases: Dataset
A class representing a custom dataset.
This class allows users to create and manage their own datasets for use in federated learning scenarios.
Functions
complete_initialization
complete_initialization(controller_kwargs, to_format)
Finalize initialization of object to be able to recover items.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
controller_kwargs | Dict[str, Any] | must contain a `"root"` key with the path to the dataset. | required |
to_format | DataReturnFormat | expected format of data returned by `__getitem__`. | required |
Source code in fedbiomed/common/dataset/_custom_dataset.py
def complete_initialization(
    self, controller_kwargs: Dict[str, Any], to_format: DataReturnFormat
) -> None:
    """Load the user-defined dataset and sanity-check its first sample.

    Stores the dataset location and the requested return format, calls the
    user-implemented ``read`` method, then verifies that the dataset is not
    empty and that ``get_item(0)`` yields a ``(data, target)`` tuple whose two
    elements are handed to ``_check_type``.

    Args:
        controller_kwargs: must contain a ``"root"`` key with the path to the dataset.
        to_format: expected format of data returned by ``__getitem__``.

    Raises:
        FedbiomedError: if ``"root"`` is missing or None, if ``read`` or
            ``get_item`` raises, if the dataset is empty, or if the first
            sample is not a tuple of two elements.
    """
    self.path = controller_kwargs.get("root")
    if self.path is None:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Custom Dataset ERROR: 'root' must be provided in controller_kwargs to specify dataset location."
        )
    self._to_format = to_format

    # The reader is user code: whatever it raises is reported the same way
    try:
        self.read()
    except Exception as read_error:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Failed to read "
            f"from dataset using read method. Please see error: {read_error}"
        ) from read_error

    n_samples = len(self)
    if n_samples == 0:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Custom Dataset ERROR: dataset is empty (len == 0)."
        )

    # Probe the first sample to check the contract of the user-defined get_item
    try:
        first = self.get_item(0)
    except Exception as item_error:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Failed to retrieve item "
            f"from dataset using get_item method. Please see error: {item_error}"
        ) from item_error

    is_pair = isinstance(first, tuple) and len(first) == 2
    if not is_pair:
        shown_length = len(first) if isinstance(first, (list, tuple)) else "N/A"
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: get_item method must return a tuple of two elements"
            f" (data, target), but got {type(first).__name__} with"
            f" length {shown_length}"
        )

    features, label = first
    self._check_type(features, "data")
    self._check_type(label, "target")
get_item abstractmethod
get_item(index)
Return a (data, target) tuple for the given index.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
index | int | Index of the sample to retrieve. | required |
Source code in fedbiomed/common/dataset/_custom_dataset.py
@abstractmethod
def get_item(self, index):
    """Return a (data, target) tuple for the given index.

    Abstract: the body is intentionally empty and has to be provided by the
    class implementing the custom dataset.

    Args:
        index (int): Index of the sample to retrieve.
    """
read abstractmethod
read()
Reads the dataset from the specified path.
This method should be implemented by subclasses to load the dataset from the given path and prepare it for use.
Source code in fedbiomed/common/dataset/_custom_dataset.py
@abstractmethod
def read(self) -> None:
    """Reads the dataset from the specified path.

    Abstract: the body is intentionally empty. Implementations are expected
    to load the dataset from the given path and prepare it for use.
    """
Dataset
Bases: ABC
Attributes
target_transform class-attribute instance-attribute
target_transform = None
to_format property writable
to_format
transform class-attribute instance-attribute
transform = None
Functions
apply_transforms
apply_transforms(sample)
Apply transforms to sample in place
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
sample | Dict[str, Any] | sample returned by `self._controller.get_sample` | required |
Raises:
| Type | Description |
|---|---|
FedbiomedError | if there is a problem applying `transform` or `target_transform` |
Source code in fedbiomed/common/dataset/_dataset.py
def apply_transforms(self, sample: Dict[str, Any]) -> Dict[str, Any]:
    """Convert, transform and type-cast the entries of a sample in place.

    ``sample["data"]`` goes through the format conversion callable, the
    default-types callable and ``self._transform``; the default-types callable
    is then applied once more to the transformed value. ``sample["target"]``
    follows the same steps with ``self._target_transform``, but only when it
    is present and not None.

    Args:
        sample: sample returned by `self._controller.get_sample`

    Returns:
        The same ``sample`` dictionary, with its entries updated.

    Raises:
        FedbiomedError: if there is a problem applying `transform` or
            `target_transform`, or applying the default types afterwards
    """
    try:
        as_default_types = self._get_default_types_callable()
        convert_format = self._get_format_conversion_callable()
        sample["data"] = self._transform(
            as_default_types(convert_format(sample["data"]))
        )
    except Exception as err:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Failed to apply `transform` to `data` "
            f"in sample in {self._to_format.value} format."
        ) from err

    # The transform output is cast again to the default types
    try:
        as_default_types = self._get_default_types_callable()
        sample["data"] = as_default_types(sample["data"])
    except Exception as err:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Failed to apply default training plan types to `data` "
            f"in sample in {self._to_format.value} format."
        ) from err

    # Nothing more to do when the sample carries no target
    if sample.get("target") is None:
        return sample

    try:
        as_default_types = self._get_default_types_callable()
        convert_format = self._get_format_conversion_callable()
        sample["target"] = self._target_transform(
            as_default_types(convert_format(sample["target"]))
        )
    except Exception as err:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Failed to apply `target_transform` to "
            f"`target` in sample in {self._to_format.value} format."
        ) from err

    try:
        as_default_types = self._get_default_types_callable()
        sample["target"] = as_default_types(sample["target"])
    except Exception as err:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Failed to apply default training plan types to `target` "
            f"in sample in {self._to_format.value} format."
        ) from err

    return sample
complete_initialization abstractmethod
complete_initialization(controller_kwargs, to_format)
Finalize initialization of object to be able to recover items
Source code in fedbiomed/common/dataset/_dataset.py
@abstractmethod
def complete_initialization(
    self,
    controller_kwargs: Dict[str, Any],
    to_format: DataReturnFormat,
) -> None:
    """Finalize initialization of object to be able to recover items

    Abstract: the body is intentionally empty, each dataset class provides
    its own implementation.

    Args:
        controller_kwargs: keyword arguments made available to the dataset
            for its initialization.
        to_format: format associated to expected return format.
    """
compute_stats
compute_stats(dataset_schema=None, stats=None, stats_args=None)
Computes statistics over the dataset using the AnalyticsOrchestrator.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
dataset_schema | Optional[Union[str, List[str], Dict[str, Any]]] | Selection arguments to filter the schema (e.g. subset of columns/keys). | None |
stats | Optional[List[str]] | List of statistics names to compute (e.g. ['mean', 'std']). If None or empty, default statistics are chosen based on data type. | None |
stats_args | Optional[Dict[str, Any]] | Specific arguments for statistics, structured matching the schema. | None |
Returns:
| Type | Description |
|---|---|
Any | Computed statistics structure. |
Raises:
| Type | Description |
|---|---|
FedbiomedError | If the dataset does not support analytics (missing get_schema_for_analytics). |
Source code in fedbiomed/common/dataset/_dataset.py
def compute_stats(
    self,
    dataset_schema: Optional[Union[str, List[str], Dict[str, Any]]] = None,
    stats: Optional[List[str]] = None,
    stats_args: Optional[Dict[str, Any]] = None,
) -> Any:
    """Computes statistics over the dataset using the AnalyticsOrchestrator.

    The call is delegated to a freshly created `AnalyticsOrchestrator`, which
    receives this dataset together with the three arguments unchanged.

    Args:
        dataset_schema: Selection arguments to filter the schema (e.g. subset of columns/keys).
        stats: List of statistics names to compute (e.g. ['mean', 'std']).
            If None or empty, default statistics are chosen based on data type.
        stats_args: Specific arguments for statistics, structured matching the schema.

    Returns:
        Computed statistics structure.

    Raises:
        FedbiomedError: If the dataset does not support analytics (missing get_schema_for_analytics).
    """
    orchestrator = AnalyticsOrchestrator()
    return orchestrator.compute_stats(
        self,
        dataset_schema=dataset_schema,
        stats=stats,
        stats_args=stats_args,
    )
ImageFolderDataset
ImageFolderDataset(transform=None, target_transform=None)
Bases: _ImageLabelDataset
Source code in fedbiomed/common/dataset/_image_label_dataset.py
def __init__(
    self,
    transform: Optional[Callable] = None,
    target_transform: Optional[Callable] = None,
):
    """Validate and store the transforms of an image/label dataset.

    Args:
        transform: optional callable, checked by `_validate_transform` and
            stored as `_transform`.
        target_transform: optional callable, checked by `_validate_transform`
            and stored as `_target_transform`.

    Raises:
        FedbiomedValueError: if the object being created is exactly an
            `_ImageLabelDataset` and not an instance of a subclass.
    """
    # Exact type comparison on purpose: only the base class itself is rejected
    is_base_class = type(self) is _ImageLabelDataset
    if is_base_class:
        message = (
            f"{ErrorNumbers.FB632.value}: "
            "`_ImageLabelDataset` cannot be instantiated directly"
        )
        raise FedbiomedValueError(message)

    validate = self._validate_transform
    self._transform = validate(transform)
    self._target_transform = validate(target_transform)
MedNistDataset
MedNistDataset(transform=None, target_transform=None)
Bases: _ImageLabelDataset
Source code in fedbiomed/common/dataset/_image_label_dataset.py
def __init__(
    self,
    transform: Optional[Callable] = None,
    target_transform: Optional[Callable] = None,
):
    """Validate and store the transforms of an image/label dataset.

    Args:
        transform: optional callable, checked by `_validate_transform` and
            stored as `_transform`.
        target_transform: optional callable, checked by `_validate_transform`
            and stored as `_target_transform`.

    Raises:
        FedbiomedValueError: if the object being created is exactly an
            `_ImageLabelDataset` and not an instance of a subclass.
    """
    # Exact type comparison on purpose: only the base class itself is rejected
    is_base_class = type(self) is _ImageLabelDataset
    if is_base_class:
        message = (
            f"{ErrorNumbers.FB632.value}: "
            "`_ImageLabelDataset` cannot be instantiated directly"
        )
        raise FedbiomedValueError(message)

    validate = self._validate_transform
    self._transform = validate(transform)
    self._target_transform = validate(target_transform)
MedicalFolderDataset
MedicalFolderDataset(data_modalities, target_modalities=None, transform=None, target_transform=None)
Bases: Dataset
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
data_modalities | Union[str, Iterable[str]] | The data modalities to use. | required |
target_modalities | Optional[Union[str, Iterable[str]]] | The target modalities to use. | None |
transform | Transform | The transform to apply to the data. Defaults to None. | None |
target_transform | Transform | The transform to apply to the target data. Defaults to None. | None |
Raises:
| Type | Description |
|---|---|
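FedbiomedValueError | If the input modalities are not valid, if `data_modalities` is empty, or if `target_transform` is given but `target_modalities` is None. |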
Source code in fedbiomed/common/dataset/_medical_folder_dataset.py
def __init__(
    self,
    data_modalities: Union[str, Iterable[str]],
    target_modalities: Optional[Union[str, Iterable[str]]] = None,
    transform: Transform = None,
    target_transform: Transform = None,
):
    """Initializes the MedicalFolderDataset.

    Modalities are normalized with `_normalize_modalities` and each transform
    is checked with `_validate_transform` against the matching modalities.

    Args:
        data_modalities (Union[str, Iterable[str]]): The data modalities to use.
        target_modalities (Optional[Union[str, Iterable[str]]]): The target modalities to use.
        transform (Transform, optional): The transform to apply to the data. Defaults to None.
        target_transform (Transform, optional): The transform to apply to the target data. Defaults to None.

    Raises:
        FedbiomedValueError:
            - If the input modalities are not valid.
            - If `data_modalities` is empty.
            - If `target_transform` is given but `target_modalities` is None
    """
    if not data_modalities:
        raise FedbiomedValueError(
            f"{ErrorNumbers.FB632.value}: `data_modalities` cannot be empty"
        )

    self._data_modalities = self._normalize_modalities(data_modalities)
    if target_modalities is None:
        self._target_modalities = None
    else:
        self._target_modalities = self._normalize_modalities(target_modalities)

    self._transform = self._validate_transform(
        transform=transform,
        modalities=self._data_modalities,
    )

    if self._target_modalities is not None:
        self._target_transform = self._validate_transform(
            transform=target_transform,
            modalities=self._target_modalities,
        )
    elif target_transform is None:
        self._target_transform = None
    else:
        # A target transform without any target modality has nothing to act on
        raise FedbiomedValueError(
            f"{ErrorNumbers.FB632.value}: `target_transform` provided but "
            "`target_modalities` is None"
        )
Attributes
data_modalities property
data_modalities
Returns the data modalities of the dataset.
demographics_columns property
demographics_columns
Returns the columns of the dataset if 'demographics' modality is present, else None.
target_modalities property
target_modalities
Returns the target modalities of the dataset, or None if not defined.
Functions
analytics_schema
analytics_schema()
Return schema associated with federated analytics.
Source code in fedbiomed/common/dataset/_medical_folder_dataset.py
def analytics_schema(self):
    """Return schema associated with federated analytics.

    Returns:
        A 2-tuple whose first element maps modality names to their spec: a
        `RowSpec` over the demographics columns under the ``"demographics"``
        key when those columns are available, and an `ImageSpec` for every
        data modality other than ``"demographics"``. The second element is
        always None.

    Raises:
        FedbiomedError: if the controller has not been set yet.
    """
    if self._controller is None:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Dataset object has not completed "
            "initialization. It is not ready to use yet."
        )

    schema = {}

    # Tabular part first, when the dataset exposes demographics columns
    if self.demographics_columns is not None:
        schema["demographics"] = RowSpec(columns=self.demographics_columns)

    # Every remaining data modality is described as an image
    for modality in self._data_modalities:
        if modality == "demographics":
            continue
        schema[modality] = ImageSpec()

    return schema, None
complete_initialization
complete_initialization(controller_kwargs, to_format)
Finalize initialization of object to be able to recover items
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
controller_kwargs | Dict[str, Any] | arguments to create controller | required |
to_format | DataReturnFormat | format associated to expected return format | required |
Source code in fedbiomed/common/dataset/_medical_folder_dataset.py
def complete_initialization(
    self,
    controller_kwargs: Dict[str, Any],
    to_format: DataReturnFormat,
) -> None:
    """Finalize initialization of object to be able to recover items

    Sets the return format, creates the controller, then uses the sample at
    index 0 to validate the transforms against the data modalities and, when
    target modalities are defined, against the target modalities.

    Args:
        controller_kwargs: arguments to create controller
        to_format: format associated to expected return format
    """
    self.to_format = to_format
    self._init_controller(controller_kwargs=controller_kwargs)

    # One sample is enough to check that format and transforms are consistent
    first_sample = self._controller.get_sample(0)

    data_items = {}
    for modality in self._data_modalities:
        data_items[modality] = first_sample[modality]
    self._validate_format_and_transformations(
        data_items,
        transform=self._transform,
    )

    if self._target_modalities is None:
        return

    target_items = {}
    for modality in self._target_modalities:
        target_items[modality] = first_sample[modality]
    self._validate_format_and_transformations(
        target_items,
        transform=self._target_transform,
        is_target=True,
    )
MedicalFolderParameters dataclass
MedicalFolderParameters(root, tabular_file=None, index_col=None, dlp=None)
Bases: ControllerParametersBase
Attributes
dlp class-attribute instance-attribute
dlp = None
index_col class-attribute instance-attribute
index_col = None
tabular_file class-attribute instance-attribute
tabular_file = None
MnistDataset
MnistDataset(transform=None, target_transform=None)
Bases: _ImageLabelDataset
Source code in fedbiomed/common/dataset/_image_label_dataset.py
def __init__(
    self,
    transform: Optional[Callable] = None,
    target_transform: Optional[Callable] = None,
):
    """Validate and store the transforms of an image/label dataset.

    Args:
        transform: optional callable, checked by `_validate_transform` and
            stored as `_transform`.
        target_transform: optional callable, checked by `_validate_transform`
            and stored as `_target_transform`.

    Raises:
        FedbiomedValueError: if the object being created is exactly an
            `_ImageLabelDataset` and not an instance of a subclass.
    """
    # Exact type comparison on purpose: only the base class itself is rejected
    is_base_class = type(self) is _ImageLabelDataset
    if is_base_class:
        message = (
            f"{ErrorNumbers.FB632.value}: "
            "`_ImageLabelDataset` cannot be instantiated directly"
        )
        raise FedbiomedValueError(message)

    validate = self._validate_transform
    self._transform = validate(transform)
    self._target_transform = validate(target_transform)
NativeDataset
NativeDataset(dataset, target=None)
Bases: Dataset
A class representing a native dataset.
This class wraps around datasets from popular ML libraries like PyTorch and scikit-learn, allowing them to be used seamlessly in a customized TrainingPlan for FedBiomed.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
dataset | | Native dataset object from a ML library (e.g., PyTorch, scikit-learn). | required |
target | Optional[Any] | Optional target data if not included in the dataset. | None |
Raises: FedbiomedError: if dataset does not implement collection interface, or if target length does not match dataset length, or if both dataset and argument provide targets.
Source code in fedbiomed/common/dataset/_native_dataset.py
def __init__(self, dataset, target: Optional[Any] = None):
    """Initialize with basic checks, without loading data to memory.

    The item at index 0 is read once: the dataset is considered supervised
    when that item is a tuple.

    Args:
        dataset: Native dataset object from a ML library (e.g., PyTorch, scikit-learn).
        target: Optional target data if not included in the dataset.

    Raises:
        FedbiomedError: if dataset does not implement collection interface,
            or if the first item cannot be read,
            or if target length does not match dataset length,
            or if both dataset and argument provide targets.
    """
    # The dataset must behave like a sized, indexable collection
    is_collection = hasattr(dataset, "__len__") and hasattr(dataset, "__getitem__")
    if not is_collection:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Dataset must implement __len__ and __getitem__."
        )
    self._dataset = dataset

    # A single probe tells whether items already carry their target
    try:
        first_item = dataset[0]
    except Exception as probe_error:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Failed to get a sample item from dataset. Details: {probe_error}"
        ) from probe_error
    self._is_supervised = isinstance(first_item, tuple)

    if target is not None:
        # Targets cannot come from both the dataset items and the argument
        if self._is_supervised:
            raise FedbiomedError(
                f"{ErrorNumbers.FB632.value}: Target found both in dataset and in 'target' argument."
            )
        # Sized targets must be aligned with the dataset
        if hasattr(target, "__len__") and len(target) != len(dataset):
            raise FedbiomedError(
                f"{ErrorNumbers.FB632.value}: Length of target ({len(target)}) does not match dataset ({len(dataset)})."
            )

    self._target = target  # may be None
Functions
complete_initialization
complete_initialization(controller_kwargs, to_format)
Select data and target, and check if they can be converted to requested format.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
controller_kwargs | Dict[str, Any] | keyword arguments for controller (not used here). | required |
to_format | DataReturnFormat | format associated to expected return format. | required |
Raises: FedbiomedError: if there is a problem converting dataset items to requested format.
Source code in fedbiomed/common/dataset/_native_dataset.py
def complete_initialization(
    self,
    controller_kwargs: Dict[str, Any],
    to_format: DataReturnFormat,
) -> None:
    """Select data and target, and check if they can be converted to requested format.

    Only the item at index 0 (and the matching target, if any) is checked.

    Args:
        controller_kwargs: keyword arguments for controller (not used here).
        to_format: format associated to expected return format.

    Raises:
        FedbiomedError: if there is a problem converting dataset items to requested format.
    """
    self._to_format = to_format
    self._converter = self._get_format_conversion_callable()

    first_item = self._dataset[0]
    if self._is_supervised:
        # Supervised items already bundle the target with the data
        data, target = first_item
    else:
        data = first_item
        target = None if self._target is None else self._target[0]

    # Data is always checked, the target only when there is one
    to_check = [data] if target is None else [data, target]
    for item in to_check:
        try:
            self._validate_format_conversion(item)
        except FedbiomedError as err:
            raise FedbiomedError(
                f"{ErrorNumbers.FB632.value}: Failed to convert dataset items to "
                f"requested format {to_format}. Details: {err}"
            ) from err
TabularDataset
TabularDataset(input_columns, target_columns=None, transform=None, target_transform=None)
Bases: Dataset
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
input_columns | Iterable | int | str | Columns to be used as input features | required |
target_columns | Optional[Iterable | int | str] | Columns to be used as target | None |
transform | Optional[Callable] | Transformation to be applied to input features | None |
target_transform | Optional[Callable] | Transformation to be applied to target | None |
Raises: FedbiomedValueError: if input_columns or target_columns are not valid; FedbiomedValueError: if transform or target_transform are not valid callables
Source code in fedbiomed/common/dataset/_tabular_dataset.py
def __init__(
    self,
    input_columns: Iterable | int | str,
    target_columns: Optional[Iterable | int | str] = None,
    transform: Optional[Callable] = None,
    target_transform: Optional[Callable] = None,
) -> None:
    """Constructor of the class

    Both transforms are checked with `_validate_transform`. The columns are
    stored as given: their validation is left to `complete_initialization`.

    Args:
        input_columns: Columns to be used as input features
        target_columns: Columns to be used as target
        transform: Transformation to be applied to input features
        target_transform: Transformation to be applied to target

    Raises:
        FedbiomedValueError: if `transform` or `target_transform` are not valid callables
    """
    validate = self._validate_transform
    self._transform = validate(transform=transform)
    self._target_transform = validate(transform=target_transform)

    # Columns are kept untouched for now: the logic validating them lives in
    # self._controller._reader, available only after complete_initialization
    self._input_columns = input_columns
    self._target_columns = target_columns
Functions
analytics_schema
analytics_schema()
Return schema for federated analytics
Source code in fedbiomed/common/dataset/_tabular_dataset.py
def analytics_schema(self):
    """Return schema for federated analytics

    Returns:
        A 2-tuple: a `RowSpec` built over the input columns, then None.
    """
    row_schema = RowSpec(columns=self._input_columns)
    return row_schema, None
complete_initialization
complete_initialization(controller_kwargs, to_format)
Finalize initialization of object to be able to recover items
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
controller_kwargs | Dict[str, Any] | arguments to create controller | required |
to_format | DataReturnFormat | format associated to expected return format | required |
Source code in fedbiomed/common/dataset/_tabular_dataset.py
def complete_initialization(
    self,
    controller_kwargs: Dict[str, Any],
    to_format: DataReturnFormat,
) -> None:
    """Finalize initialization of object to be able to recover items

    Sets the return format, creates the controller, normalizes the input and
    target columns through the controller, then uses the sample at index 0 to
    validate `transform` on the input columns and `target_transform` on the
    target columns.

    Args:
        controller_kwargs: arguments to create controller
        to_format: format associated to expected return format

    Raises:
        FedbiomedError: if the sample obtained from the controller has more
            than one row.
    """
    self.to_format = to_format
    self._init_controller(controller_kwargs=controller_kwargs)

    # Normalize columns using controller (implies validation)
    self._input_columns = self._controller.normalize_columns(self._input_columns)
    if self._target_columns is not None:
        self._target_columns = self._controller.normalize_columns(
            self._target_columns
        )
        # Check for overlap between input_columns and target_columns
        _intersection_cols = list(
            set(self._input_columns) & set(self._target_columns)
        )
        if _intersection_cols:
            logger.warning(
                f"Columns {_intersection_cols} are present in both input_columns and target_columns."
            )

    sample = self._controller.get_sample(0)  # type: ignore
    n_rows, _ = sample.shape
    if n_rows > 1:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: TabularDataset currently only supports "
            "row-wise samples. Sample obtained from controller has multiple rows."
        )

    self._validate_format_and_transformations(
        self._get_item_from_sample(sample, self._input_columns),
        transform=self._transform,
    )
    if self._target_columns is not None:
        # The target must be checked against the target transform, not the
        # transform of the input features.
        # NOTE(review): MedicalFolderDataset also passes `is_target=True` for
        # its target validation - confirm whether it should be passed here too.
        self._validate_format_and_transformations(
            self._get_item_from_sample(sample, self._target_columns),
            transform=self._target_transform,
        )
Functions
get_controller
get_controller(data_type, controller_parameters)
Get controller instance based on data_type and controller_parameters
Source code in fedbiomed/common/dataset/_mappings.py
def get_controller(
    data_type: str,
    controller_parameters: dict,
) -> Controller:
    """Get controller instance based on data_type and controller_parameters

    Args:
        data_type: value identifying the dataset type; it must resolve to a
            key of `REGISTRY_CONTROLLERS`.
        controller_parameters: raw parameters, filtered through the
            parameters class registered for the type before being passed to
            the controller class.

    Returns:
        An instance of the controller class registered for `data_type`.

    Raises:
        FedbiomedError: if `data_type` is unknown, if the parameters cannot
            be parsed, or if the controller cannot be created.
    """
    # Validate that data_type is implemented.
    data_type_: Optional[DatasetTypes] = DatasetTypes.get_type_by_value(data_type)
    if not (data_type_ and data_type_ in REGISTRY_CONTROLLERS):
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: "
            f"Unknown 'data_type', implemented are: {list(REGISTRY_CONTROLLERS.keys())}"
        )

    # The third element of a registry entry (dataset class) is not needed here
    controller_class, parameters_class, _dataset_class = REGISTRY_CONTROLLERS[
        data_type_
    ]

    # Validate and instantiate parameters
    try:
        parameters_instance = parameters_class.from_dict(controller_parameters)
    except Exception as err:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Failed to parse dataset_parameters: {str(err)}"
        ) from err

    try:
        controller = controller_class(**parameters_instance.to_dict())
    except FedbiomedError:
        # Errors already expressed as FedbiomedError are propagated untouched
        raise
    except Exception as err:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Unhandled exception occurred: {str(err)}"
        ) from err
    return controller