Classes that simplify imports from fedbiomed.common.datamanager
Classes
DataManager
DataManager(dataset, target=None, **kwargs)
Bases: object
Factory class that builds different data loaders
Data loader type is based on the framework of the training plan.
If `dataset` is not already a `Dataset`, it is wrapped in a `NativeDataset` object.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
dataset | Union[Dataset, Any] | Either an already structured `Dataset`, or the data component of an unformatted dataset | required |
target | Optional[Any] | Target component of an unformatted dataset, or `None` for an already structured dataset | None |
**kwargs | dict | Additional parameters that are going to be used for data loader | {} |
Raises:
| Type | Description |
|---|---|
FedbiomedError | using targets with structured dataset |
FedbiomedError | cannot create a native dataset from unformatted data |
Source code in fedbiomed/common/datamanager/_data_manager.py
def __init__(
self, dataset: Union[Dataset, Any], target: Optional[Any] = None, **kwargs: dict
) -> None:
"""Constructor of DataManager,
Args:
dataset: Either an already structured `Dataset` or the data component of
unformatted dataset
target: Target component of unformatted dataset, or `None` for an already
structured dataset
**kwargs: Additional parameters that are going to be used for data loader
Raises:
FedbiomedError: using targets with structured dataset
FedbiomedError: cannot create a native dataset from unformatted data
"""
# no type check needed, kwargs are dict
self._loader_arguments = kwargs
if isinstance(dataset, Dataset):
if target is not None:
raise FedbiomedError(
f"{ErrorNumbers.FB632.value}: cannot use `target` argument "
f"when using a formatted dataset. Targets are already part of the "
f"`Dataset` argument"
)
else:
dataset = NativeDataset(dataset, target)
self._dataset = dataset
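A minimal usage sketch (variable names hypothetical; assumes NumPy arrays are accepted as unformatted data, which is then wrapped in a `NativeDataset`):
import numpy as np
from fedbiomed.common.datamanager import DataManager
# Hypothetical toy data: 100 samples with 5 features each
features = np.random.rand(100, 5)
labels = np.random.randint(0, 2, size=100)
# Unformatted data and target are wrapped into a `NativeDataset` internally;
# extra kwargs (here `batch_size`) are kept as loader arguments
data_manager = DataManager(dataset=features, target=labels, batch_size=32)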
Functions
complete_dataset_initialization
complete_dataset_initialization(controller_kwargs)
Finalizes initialization of the DataManager's dataset controller
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
controller_kwargs | Dict[str, Any] | arguments for the controller | required |
Raises:
| Type | Description |
|---|---|
FedbiomedError | if `_data_manager_instance` is not initialized |
FedbiomedError | if there is a problem completing dataset initialization |
Source code in fedbiomed/common/datamanager/_data_manager.py
def complete_dataset_initialization(
self, controller_kwargs: Dict[str, Any]
) -> None:
"""Finalizes initialization of the DataManager's dataset controller
Args:
controller_kwargs: arguments for the controller
Raises:
FedbiomedError: if `_data_manager_instance` is not initialized
FedbiomedError: if there is a problem completing dataset initialization
"""
if not self._data_manager_instance:
raise FedbiomedError(
f"{ErrorNumbers.FB632.value}: Data manager instance is not initialized. "
f"Please call `load()` first."
)
try:
self._dataset.complete_initialization(
controller_kwargs,
_dm_to_format[self._data_manager_instance.__class__],
)
except FedbiomedError as e:
raise e
except Exception as e:
raise FedbiomedError(
f"{ErrorNumbers.FB632.value}: Unable to complete dataset initialization."
) from e
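The call order matters: `load()` (see below) must run before `complete_dataset_initialization()`, as the error above indicates. A sketch with hypothetical controller arguments (the actual keys in `controller_kwargs` depend on the dataset controller in use):
# `data_manager` and `tp_type` as in the surrounding sketches
data_manager.load(tp_type)
data_manager.complete_dataset_initialization(
    {"root": "/path/to/data"}  # hypothetical controller kwargs
)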
extend_loader_args
extend_loader_args(extension)
Extends the class's loader arguments
Extends the class's `_loader_arguments` attribute with additional key-value pairs from the `extension` argument. If a key already exists in `_loader_arguments`, it is not replaced.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
extension | Optional[Dict] | the mapping used to extend the loader arguments | required |
Source code in fedbiomed/common/datamanager/_data_manager.py
def extend_loader_args(self, extension: Optional[Dict]) -> None:
"""Extends the class' loader arguments
Extends the class's `_loader_arguments` attribute with additional key-values from
the `extension` argument. If a key already exists in the `_loader_arguments`, then
it is not replaced.
Args:
extension: the mapping used to extend the loader arguments
"""
if extension:
self._loader_arguments.update(
{
key: value
for key, value in extension.items()
if key not in self._loader_arguments
}
)
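A short sketch of the non-overriding semantics, grounded in the docstring above (values hypothetical):
data_manager = DataManager(dataset=features, target=labels, batch_size=32)
# `batch_size` already exists, so it is kept; `shuffle` is new, so it is added
data_manager.extend_loader_args({"batch_size": 64, "shuffle": False})
# Resulting loader arguments: {"batch_size": 32, "shuffle": False}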
load
load(tp_type)
Loads the proper data manager based on the given training plan type and the `dataset` and `target` attributes.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
tp_type | TrainingPlans | Enumeration instance of TrainingPlans that stands for type of training plan. | required |
Raises:
| Type | Description |
|---|---|
FedbiomedError | unknown training plan type |
Source code in fedbiomed/common/datamanager/_data_manager.py
def load(self, tp_type: TrainingPlans) -> None:
"""Loads proper DataManager based on given TrainingPlan and
`dataset`, `target` attributes.
Args:
tp_type: Enumeration instance of TrainingPlans that stands for type of training plan.
Raises:
FedbiomedError: unknown training plan type
"""
if tp_type in _tp_to_datamanager:
self._data_manager_instance = _tp_to_datamanager[tp_type](
dataset=self._dataset, **self._loader_arguments
)
else:
raise FedbiomedError(
f"{ErrorNumbers.FB632.value}: Unknown training plan type, "
"cannot instantiate data manager."
)
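A usage sketch, assuming `TrainingPlans` is importable from `fedbiomed.common.constants` and exposes a `TorchTrainingPlan` member (an assumption; check the constants module of your version):
from fedbiomed.common.constants import TrainingPlans
# Instantiates the PyTorch-specific data manager behind the scenes
data_manager.load(tp_type=TrainingPlans.TorchTrainingPlan)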
FrameworkDataManager
FrameworkDataManager(dataset, **kwargs)
Bases: ABC
Class for creating data loaders from a dataset, depending on the training plan framework
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
dataset | Dataset | dataset object | required |
**kwargs | dict | arguments for data loader | {} |
Raises:
| Type | Description |
|---|---|
FedbiomedError | Bad argument type |
Source code in fedbiomed/common/datamanager/_framework_data_manager.py
@abstractmethod
def __init__(self, dataset: Dataset, **kwargs: dict):
"""Class constructor
Args:
dataset: dataset object
**kwargs: arguments for data loader
Raises:
FedbiomedError: Bad argument type
"""
if not isinstance(dataset, Dataset):
raise FedbiomedError(
f"{ErrorNumbers.FB632.value}: The argument `dataset` should be a "
f"`fedbiomed.common.Dataset` object"
)
self._dataset = dataset
self._loader_arguments = kwargs
Functions
load_state
load_state(state)
Loads state of the data loader
It currently keeps only the testing index, training index, and test ratio as state.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
state | Dict | Object containing data loader state. | required |
Source code in fedbiomed/common/datamanager/_framework_data_manager.py
def load_state(self, state: Dict):
"""Loads state of the data loader
It currently keeps only testing index, training index and test ratio
as state.
Args:
state: Object containing data loader state.
"""
self._testing_index = state.get("testing_index", [])
self._training_index = state.get("training_index", [])
self._test_ratio = state.get("test_ratio", None)
save_state
save_state()
Gets state of the data loader.
Returns:
| Type | Description |
|---|---|
Dict | A Dict containing data loader state. |
Source code in fedbiomed/common/datamanager/_framework_data_manager.py
def save_state(self) -> Dict:
"""Gets state of the data loader.
Returns:
A Dict containing data loader state.
"""
data_manager_state = {}
data_manager_state["training_index"] = self._training_index
data_manager_state["testing_index"] = self._testing_index
data_manager_state["test_ratio"] = self._test_ratio
return data_manager_state
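A round-trip sketch: the state returned by `save_state()` can be passed back to `load_state()` in a later execution to reproduce the same train/validation split (`manager` is a hypothetical instance of a `FrameworkDataManager` subclass):
state = manager.save_state()  # {"training_index": ..., "testing_index": ..., "test_ratio": ...}
# ... persist `state`, then in a later round ...
manager.load_state(state)     # restores the indexes and the test ratio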
split
split(test_ratio, test_batch_size, is_shuffled_testing_dataset=False)
Splits the Dataset into train and validation data loaders.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
test_ratio | float | Split ratio for the validation set. The rest of the samples are used for training | required |
test_batch_size | Optional[int] | Batch size to use for testing subset | required |
is_shuffled_testing_dataset | bool | if True, randomly select different samples for the testing subset at each execution. If False, reuse previous split when possible. | False |
Raises:
| Type | Description |
|---|---|
FedbiomedError | badly formatted arguments |
FedbiomedError | cannot get the number of samples from the dataset |
Returns:
| Name | Type | Description |
|---|---|---|
train_loader | Optional[DataLoader] | DataLoader for training subset. `None` if `test_ratio` is `1` |
test_loader | Optional[DataLoader] | DataLoader for validation subset. `None` if `test_ratio` is `0` |
Source code in fedbiomed/common/datamanager/_framework_data_manager.py
def split(
self,
test_ratio: float,
test_batch_size: Optional[int],
is_shuffled_testing_dataset: bool = False,
) -> Tuple[Optional[DataLoader], Optional[DataLoader]]:
"""Split Dataset into train and validation dataloaders.
Args:
test_ratio: Split ratio for the validation set. The rest of the samples are used for training
test_batch_size: Batch size to use for testing subset
is_shuffled_testing_dataset: if True, randomly select different samples for the testing
subset at each execution. If False, reuse previous split when possible.
Raises:
FedbiomedError: Arguments bad format
FedbiomedError: Cannot get number of samples from dataset
Returns:
train_loader: DataLoader for training subset. `None` if the `test_ratio` is `1`
test_loader: DataLoader for validation subset. `None` if the `test_ratio` is `0`
"""
# No need to check is_shuffled_testing_dataset, any argument can be interpreted as bool
# Check the type of argument test_batch_size
if not isinstance(test_batch_size, int) and test_batch_size is not None:
raise FedbiomedError(
f"{ErrorNumbers.FB632.value}: The argument `test_batch_size` should be "
f"type `int` or `None` not {type(test_batch_size)}"
)
# Check the argument `ratio` is of type `float`
if not isinstance(test_ratio, (float, int)):
raise FedbiomedError(
f"{ErrorNumbers.FB632.value}: The argument `test_ratio` should be "
f"type `float` or `int` not {type(test_ratio)}"
)
# Check ratio is valid for splitting
if test_ratio < 0 or test_ratio > 1:
raise FedbiomedError(
f"{ErrorNumbers.FB632.value}: The argument `test_ratio` should be "
f"equal or between 0 and 1, not {test_ratio}"
)
# Wrap dataset in framework specific class if needed
framework_dataset = self._dataset_wrapper(self._dataset)
try:
samples = len(framework_dataset)
except AttributeError as e:
raise FedbiomedError(
f"{ErrorNumbers.FB632.value}: Can not get number of samples from "
f"{str(self._dataset)} due to undefined attribute, {str(e)}"
) from e
except TypeError as e:
raise FedbiomedError(
f"{ErrorNumbers.FB632.value}: Can not get number of samples from "
f"{str(self._dataset)}, {str(e)}"
) from e
if self._test_ratio != test_ratio and self._test_ratio is not None:
if not is_shuffled_testing_dataset:
logger.info(
"`test_ratio` value has changed: this will change the testing dataset"
)
is_shuffled_testing_dataset = True
_is_loading_failed: bool = False
# Calculate number of samples for train and validation subsets
test_samples = math.floor(samples * test_ratio)
train_samples = samples - test_samples
if self._testing_index and not is_shuffled_testing_dataset:
try:
self._load_indexes(
framework_dataset, self._training_index, self._testing_index
)
_is_loading_failed = False
except IndexError:
_is_loading_failed = True
need_new_split = (
not self._testing_index or is_shuffled_testing_dataset or _is_loading_failed
)
if need_new_split:
if self._loader_arguments.get("shuffle", True):
# Random split (shuffled)
self._subset_train, self._subset_test = self._random_split(
framework_dataset,
[train_samples, test_samples],
)
else:
# Deterministic split (no shuffle) — preserve original order
all_indices = list(range(samples))
train_indices = all_indices[:train_samples]
test_indices = all_indices[train_samples : train_samples + test_samples]
self._subset_train = self._subset_class(
framework_dataset, train_indices
)
self._subset_test = (
self._subset_class(framework_dataset, test_indices)
if test_samples > 0
else None
)
self._training_index = list(self._subset_train.indices)
self._testing_index = (
list(self._subset_test.indices) if self._subset_test is not None else []
)
if not test_batch_size and self._subset_test is not None:
test_batch_size = len(self._subset_test)
self._test_ratio = test_ratio
loaders = (
self._subset_loader(self._subset_train, **self._loader_arguments),
self._subset_loader(self._subset_test, batch_size=test_batch_size),
)
return loaders
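A usage sketch (`manager` as above). Passing `None` as `test_batch_size` serves the whole validation subset as a single batch, per the fallback near the end of the source:
train_loader, test_loader = manager.split(
    test_ratio=0.2,        # 20% of samples go to validation, the rest to training
    test_batch_size=None,  # None: validation subset served as one batch
)
# With is_shuffled_testing_dataset=False (the default), a previously saved
# split is reused when possible instead of re-sampling the validation subset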
SkLearnDataManager
SkLearnDataManager(dataset, **kwargs)
Bases: FrameworkDataManager
Class for creating data loaders from a dataset for scikit-learn training plans
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
dataset | Dataset | dataset object | required |
**kwargs | dict | arguments for data loader | {} |
Source code in fedbiomed/common/datamanager/_sklearn_data_manager.py
def __init__(self, dataset: Dataset, **kwargs: dict):
"""Class constructor
Args:
dataset: dataset object
**kwargs: arguments for data loader
"""
super().__init__(dataset, **kwargs)
# Note: managing seed to control reproducibility is now done in training plan
# `post_init` method.
# Randomization for sklearn data manager & loader uses only `np.random.seed()`
self._dataset.to_format = DataReturnFormat.SKLEARN
Functions
subset_test
subset_test()
Gets validation subset of the dataset.
Returns:
| Type | Description |
|---|---|
Optional[_SkLearnSubset] | Validation subset |
Source code in fedbiomed/common/datamanager/_sklearn_data_manager.py
def subset_test(self) -> Optional[_SkLearnSubset]:
"""Gets validation subset of the dataset.
Returns:
Validation subset
"""
return self._subset_test
subset_train
subset_train()
Gets train subset of the dataset.
Returns:
| Type | Description |
|---|---|
Optional[_SkLearnSubset] | Train subset |
Source code in fedbiomed/common/datamanager/_sklearn_data_manager.py
def subset_train(self) -> Optional[_SkLearnSubset]:
"""Gets train subset of the dataset.
Returns:
Train subset
"""
return self._subset_train
TorchDataManager
TorchDataManager(dataset, **kwargs)
Bases: FrameworkDataManager
Class for creating data loaders from a dataset for PyTorch training plans
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
dataset | Dataset | dataset object | required |
**kwargs | dict | arguments for data loader | {} |
Raises:
| Type | Description |
|---|---|
FedbiomedError | Bad argument type |
Source code in fedbiomed/common/datamanager/_torch_data_manager.py
def __init__(self, dataset: Dataset, **kwargs: dict):
"""Class constructor
Args:
dataset: dataset object
**kwargs: arguments for data loader
Raises:
FedbiomedError: Bad argument type
"""
super().__init__(dataset, **kwargs)
# Note: managing seed to control reproducibility is now done in training plan
# `post_init` method.
# Randomization for torch data manager & loader uses only `torch.manual_seed()`
self._dataset.to_format = DataReturnFormat.TORCH
Functions
load_all_samples
load_all_samples()
Loads all samples as a PyTorch DataLoader without splitting.
Returns:
| Type | Description |
|---|---|
PytorchDataLoader | DataLoader for the entire dataset. Loader arguments are retrieved from the `**kwargs` given at initialization. |
Source code in fedbiomed/common/datamanager/_torch_data_manager.py
def load_all_samples(self) -> PytorchDataLoader:
"""Loading all samples as PyTorch DataLoader without splitting.
Returns:
Dataloader for entire datasets. `DataLoader` arguments will be retrieved from the `**kwargs` which
is defined while initializing the class
"""
torch_dataset = _DatasetWrapper(self._dataset)
return self._create_data_loader(torch_dataset, **self._loader_arguments) # type: ignore
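A usage sketch (assumes `TorchDataManager` is importable from `fedbiomed.common.datamanager`, as the module summary suggests, and that batches yield input/target pairs, which is an assumption about the wrapped `Dataset`):
from fedbiomed.common.datamanager import TorchDataManager
manager = TorchDataManager(dataset, batch_size=16)  # `dataset` is a fedbiomed `Dataset`
loader = manager.load_all_samples()
for inputs, targets in loader:  # assumption: batches are (inputs, targets) pairs
    ...                         # full dataset, no train/validation split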
subset_test
subset_test()
Gets validation subset of the dataset.
Returns:
| Type | Description |
|---|---|
Optional[_TorchSubset] | Validation subset |
Source code in fedbiomed/common/datamanager/_torch_data_manager.py
def subset_test(self) -> Optional[_TorchSubset]:
"""Gets validation subset of the dataset.
Returns:
Validation subset
"""
return self._subset_test
subset_train
subset_train()
Gets train subset of the dataset.
Returns:
| Type | Description |
|---|---|
Optional[_TorchSubset] | Train subset |
Source code in fedbiomed/common/datamanager/_torch_data_manager.py
def subset_train(self) -> Optional[_TorchSubset]:
"""Gets train subset of the dataset.
Returns:
Train subset
"""
return self._subset_train