Classes that simplify imports from fedbiomed.common.dataset_controller
Classes
Controller
Controller()
Bases: ABC, DataLoadingPlanMixin
Source code in fedbiomed/common/dataset_controller/_controller.py
def __init__(self):
    """Initialise the controller by running the parent initialisers."""
    super().__init__()  # initialises DataLoadingPlanMixin (self._dlp = None)
Attributes
root property writable
root
Functions
get_sample abstractmethod
get_sample(index)
Retrieve a data sample without applying transforms
Source code in fedbiomed/common/dataset_controller/_controller.py
@abstractmethod
def get_sample(self, index: int) -> Dict[str, Any]:
    """Return the raw sample stored at `index`, with no transforms applied.

    This is an abstract method: concrete controllers must override it.

    Args:
        index: Position of the requested sample.

    Returns:
        The sample as a dict.
    """
get_types
get_types()
Get type directly from values in dict returned by get_sample
Source code in fedbiomed/common/dataset_controller/_controller.py
def get_types(self):
    """Report the type name of every value found in the first sample.

    Returns:
        Dict mapping each key of `get_sample(0)` to the class name of its value.
    """
    first_sample = self.get_sample(0)
    type_names = {}
    for field, value in first_sample.items():
        type_names[field] = type(value).__name__
    return type_names
shape
shape()
Get shape directly from values in dict returned by get_sample
Source code in fedbiomed/common/dataset_controller/_controller.py
def shape(self) -> Dict[str, Any]:
    """Describe the shape of every value found in the first sample.

    Supported values: anything exposing `.shape`, `int`, `float`, `dict`,
    `PIL.Image` and `None`. Child classes can override this method for
    specific cases.

    Returns:
        Dict mapping each key of `get_sample(0)` to:
            - `value.shape` when the value has a `shape` attribute
            - `1` for `int` / `float`
            - the number of entries for a `dict`
            - `{"size": ..., "mode": ...}` for an image
            - `None` for `None`

    Raises:
        FedbiomedError: if the sample is not a `dict` or a value has an
            unsupported type
    """
    first = self.get_sample(0)
    if not isinstance(first, dict):
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Expected `sample` to be a `dict`, got "
            f"{type(first).__name__}"
        )

    shapes = {}
    # The checks below are evaluated in a fixed order; the first match wins
    for name, value in first.items():
        if hasattr(value, "shape"):
            shapes[name] = value.shape
            continue
        if isinstance(value, (int, float)):
            shapes[name] = 1
            continue
        if isinstance(value, dict):
            shapes[name] = len(value)
            continue
        if isinstance(value, Image):
            shapes[name] = {"size": value.size, "mode": value.mode}
            continue
        if value is None:
            shapes[name] = None
            continue
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Not possible to get shape for value "
            f"of key: {name} that is type {type(value).__name__}"
        )
    return shapes
validate
validate()
Validates coherence of controller
Raises:
| Type | Description |
|---|---|
FedbiomedError | if coherence issue is found |
Source code in fedbiomed/common/dataset_controller/_controller.py
def validate(self) -> None:
    """Validates coherence of controller

    The base implementation performs no check and returns `None`; child
    classes may override it.

    Raises:
        FedbiomedError: if coherence issue is found
    """
CustomController
CustomController(root)
Bases: Controller
Controller for user-defined custom datasets.
Stores only the root path; the other methods are delegated to `CustomDataset`.
Raises:
| Type | Description |
|---|---|
FedbiomedError | if root is not a valid, existing path. |
Source code in fedbiomed/common/dataset_controller/_custom_controller.py
def __init__(self, root: Union[str, Path]) -> None:
    """Create a controller that only keeps track of the dataset root.

    Args:
        root: Dataset root directory (must exist on disk).

    Raises:
        FedbiomedError: if *root* is not a valid, existing path.
    """
    # Parent initialisers first: Controller -> DataLoadingPlanMixin (self._dlp = None)
    super().__init__()
    self.root = root
    # Read the root back from the attribute before storing it as a string
    root_as_str = str(self.root)
    self._controller_kwargs = {"root": root_as_str}
Attributes
root instance-attribute
root = root
Functions
get_sample
get_sample(index)
Not meaningful for custom datasets — data access is owned by CustomDataset.
Source code in fedbiomed/common/dataset_controller/_custom_controller.py
def get_sample(self, index: int):
    """Placeholder: data access for custom datasets is owned by CustomDataset.

    Logs a warning and returns `None`; `index` is not used.
    """
    message = "CustomController.get_sample() should not be called; data access is handled by CustomDataset."
    logger.warning(message)
    return None
get_types
get_types()
Not meaningful for custom datasets — type info is owned by CustomDataset.
Source code in fedbiomed/common/dataset_controller/_custom_controller.py
def get_types(self):
    """Placeholder: type information for custom datasets is owned by CustomDataset.

    Logs a warning and returns `None`.
    """
    message = "CustomController.get_types() should not be called; type information is defined by the CustomDataset."
    logger.warning(message)
    return None
shape
shape()
Not meaningful for custom datasets — shape is owned by CustomDataset.
Source code in fedbiomed/common/dataset_controller/_custom_controller.py
def shape(self):
    """Placeholder: shape for custom datasets is owned by CustomDataset.

    Logs a warning and returns `None`.
    """
    message = "CustomController.shape() should not be called; shape is defined by the CustomDataset."
    logger.warning(message)
    return None
ImageFolderController
ImageFolderController(root)
Bases: Controller
Generic ImageFolder where data is arranged like this:
root
├── class_x
│   ├── xxx.ext
│   ├── xxy.ext
│   └── ...
├── class_y
│   ├── 123.ext
│   └── ...
└── ...
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
root | Union[str, Path] | Root directory path | required |
Raises:
| Type | Description |
|---|---|
FedbiomedError | if `ImageFolder` can not be initialized |
Source code in fedbiomed/common/dataset_controller/_image_folder_controller.py
def __init__(self, root: Union[str, Path]) -> None:
    """Constructor of the class

    Args:
        root: Root directory path

    Raises:
        FedbiomedError: if `ImageFolder` can not be initialized
    """
    # Run the parent initialisers (Controller -> DataLoadingPlanMixin) so the
    # data loading plan state is set up, as the sibling controllers do
    super().__init__()
    self.root = root

    try:
        # Discover the class folders, then list the (path, class index) samples
        _, self._class_to_idx = folder.find_classes(directory=self.root)
        self._samples = folder.make_dataset(
            directory=self.root,
            class_to_idx=self._class_to_idx,
            extensions=self._extensions,
            is_valid_file=self._is_valid_file,
            allow_empty=self._allow_empty,
        )
    except Exception as e:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: "
            "Failed to instantiate ImageFolderDataset object"
        ) from e

    # Arguments stored in `_controller_kwargs`; root is kept as a plain string
    self._controller_kwargs = {
        "root": str(self.root),
    }
Attributes
root instance-attribute
root = root
Functions
get_sample
get_sample(index)
Retrieve a data sample without applying transforms
Source code in fedbiomed/common/dataset_controller/_image_folder_controller.py
def get_sample(self, index: int) -> Dict[str, Any]:
    """Load the sample at `index` without applying transforms.

    Args:
        index: Position of the sample in the list of samples.

    Returns:
        Dict with the loaded item under `"data"` and its label under `"target"`.

    Raises:
        FedbiomedError: if the sample can not be looked up or loaded
    """
    try:
        file_path, label = self._samples[index]
        loaded = self._loader(file_path)
    except Exception as exc:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Failed to retrieve item at index {index}"
        ) from exc
    else:
        return {"data": loaded, "target": label}
MedNistController
MedNistController(root)
Bases: Controller
Generic data controller where the data is arranged in this way:
root
└── MedNIST
    ├── AbdomenCT
    │   ├── 000000.jpeg
    │   └── ...
    ├── BreastMRI/
    ├── ChestCT/
    ├── CXR/
    ├── Hand/
    └── HeadCT/
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
root | Union[str, Path] | Root directory path | required |
Raises:
| Type | Description |
|---|---|
FedbiomedError | if `ImageFolder` can not be initialized |
Source code in fedbiomed/common/dataset_controller/_mednist_controller.py
def __init__(self, root: Union[str, Path]) -> None:
    """Constructor of the class

    Args:
        root: Root directory path

    Raises:
        FedbiomedError: if `ImageFolder` can not be initialized
    """
    # Run the parent initialisers (Controller -> DataLoadingPlanMixin) so the
    # data loading plan state is set up, as the sibling controllers do
    super().__init__()
    self.root = root

    # Fetch the dataset only when the `MedNIST` folder is not already in root
    if not (self.root / "MedNIST").exists():
        download_mednist(self.root)

    try:
        self._dataset = ImageFolder(self.root / "MedNIST")
    except Exception as e:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: "
            f"The following error raised while loading the data folder: {e}"
        ) from e

    # Arguments stored in `_controller_kwargs`; root is kept as a plain string
    self._controller_kwargs = {
        "root": str(self.root),
    }
Attributes
root instance-attribute
root = root
Functions
get_sample
get_sample(index)
Retrieve a data sample without applying transforms
Source code in fedbiomed/common/dataset_controller/_mednist_controller.py
def get_sample(self, index: int) -> Dict[str, Any]:
    """Fetch the sample at `index` without applying transforms.

    Args:
        index: Position of the sample in the underlying dataset.

    Returns:
        Dict with the item under `"data"` and its label under `"target"`.

    Raises:
        FedbiomedError: if the item can not be retrieved
    """
    try:
        item, label = self._dataset[index]
    except Exception as exc:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Failed to retrieve item at index {index}"
        ) from exc
    else:
        return {"data": item, "target": label}
MedicalFolderController
MedicalFolderController(root, tabular_file=None, index_col=None, dlp=None, validate=True)
Bases: Controller
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
root | Union[str, Path] | Root directory path | required |
tabular_file | Optional[Union[str, PathLike, Path]] | Path to CSV file containing the demographic information | None |
index_col | Optional[Union[int, str]] | Column name in tabular file containing the subjects names | None |
Raises:
| Type | Description |
|---|---|
FedbiomedError | if one in `tabular_file` and `index_col` is given and the other is not |
Source code in fedbiomed/common/dataset_controller/_medical_folder_controller.py
def __init__(
    self,
    root: Union[str, Path],
    tabular_file: Optional[Union[str, PathLike, Path]] = None,
    index_col: Optional[Union[int, str]] = None,
    dlp: Optional[DataLoadingPlan] = None,
    validate: bool = True,
) -> None:
    """Constructor for class `MedicalFolder`

    Args:
        root: Root directory path
        tabular_file: Path to CSV file containing the demographic information
        index_col: Column name in tabular file containing the subjects names
        dlp: Optional data loading plan; when given it is applied with `set_dlp`
        validate: If True, `validate()` is called at the end of construction

    Raises:
        FedbiomedError:
            - if one in `tabular_file` and `index_col` is given and the other is not
    """
    super().__init__()  # initialises DataLoadingPlanMixin (self._dlp = None)
    self.root = root
    # Each argument goes through its own validator before the pair is cross-checked below
    self._tabular_file = (
        None if tabular_file is None else self._validate_tabular_file(tabular_file)
    )
    self._index_col = self._validate_index_col(index_col)

    # Folder structure <subject>/<modality>/<file> in DataFrame format
    self._df_dir = self._make_df_dir(root=self.root, extensions=self._extensions)

    # Demographics: `tabular_file` and `index_col` must be given together or not at all
    if (tabular_file is None) != (index_col is None):
        raise FedbiomedError(
            f"{ErrorNumbers.FB613.value}: "
            "Arguments `tabular_file` and `index_col`, both or none are expected"
        )
    self._demographics = (
        None
        if tabular_file is None
        else self.read_demographics(tabular_file, index_col)
    )

    # Data loading plan
    # NOTE(review): `set_dlp` itself calls `validate()`, so when `dlp` is given
    # validation runs here regardless of the `validate` argument — confirm intended
    if dlp is not None:
        self.set_dlp(dlp)

    # Function 'validate' instantiates self._controller_kwargs
    if validate is True:
        self.validate()
Attributes
demographics property
demographics
df_dir property
df_dir
Returns a copy to ensure _df_dir is not modified
index_col property
index_col
modalities property
modalities
Returns keys of dict that maps modalities to folders
root instance-attribute
root = root
subjects property
subjects
tabular_file property
tabular_file
Functions
available_subjects
available_subjects(subjects_from_index, subjects_from_folder=None)
Checks missing subject folders and missing entries in demographics
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
subjects_from_index | Union[list, Series] | Given subject folder names in demographics | required |
subjects_from_folder | Optional[list] | list of subject folder names | None |
Returns:
| Type | Description |
|---|---|
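Dict[str, str] | Dict with next keys: `missing_folders` (subjects in demographics absent in folder structure), `missing_entries` (subjects in folder structure absent in demographics), `intersection` (subjects present in folder structure and demographics) |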
Source code in fedbiomed/common/dataset_controller/_medical_folder_controller.py
def available_subjects(
    self,
    subjects_from_index: Union[list, pd.Series],
    subjects_from_folder: Optional[list] = None,
) -> Dict[str, list]:
    """Checks missing subject folders and missing entries in demographics

    Args:
        subjects_from_index: Given subject folder names in demographics
        subjects_from_folder: list of subject folder names. When None,
            `self.subjects` is used.

    Returns:
        Dict with next keys, each mapping to a list of subjects (duplicates
        removed, order not guaranteed):
            - missing_folders: subjects in demographics absent in folder structure
            - missing_entries: subjects in folder structure absent in demographics
            - intersection: subjects present in folder structure and demographics
    """
    # Select all subject folders if it is not given
    if subjects_from_folder is None:
        subjects_from_folder = self.subjects

    # Build each set once instead of once per key
    in_index = set(subjects_from_index)
    in_folder = set(subjects_from_folder)

    return {
        # Missing subject that will cause warnings
        "missing_folders": list(in_index - in_folder),
        # Missing entries that will cause errors
        "missing_entries": list(in_folder - in_index),
        # Intersection
        "intersection": list(in_index & in_folder),
    }
demographics_column_names
demographics_column_names(path)
Source code in fedbiomed/common/dataset_controller/_medical_folder_controller.py
def demographics_column_names(self, path: Union[str, Path]):
    """Return the column names of the demographics file located at `path`.

    Args:
        path: Path to the demographics tabular file, passed to `read_demographics`.

    Returns:
        The `.columns.values` of the object returned by `read_demographics`.
    """
    demographics = self.read_demographics(path)
    return demographics.columns.values
get_dataset_type staticmethod
get_dataset_type()
Source code in fedbiomed/common/dataset_controller/_medical_folder_controller.py
@staticmethod
def get_dataset_type() -> DatasetTypes:
    """Return the dataset type handled by this controller (`DatasetTypes.MEDICAL_FOLDER`)."""
    return DatasetTypes.MEDICAL_FOLDER
get_sample
get_sample(index)
Retrieve a data sample without applying transforms
Source code in fedbiomed/common/dataset_controller/_medical_folder_controller.py
def get_sample(
    self, index: int
) -> Dict[str, NiftiReader.data_type | Dict[str, Any]]:
    """Read every modality of the sample at `index`, without transforms.

    Runs `validate()` first when the controller is not validated yet.

    Args:
        index: Position of the sample in the list of samples.

    Returns:
        Dict with one entry per modality (as returned by `NiftiReader.read`),
        plus a `"demographics"` entry when the sample has one.

    Raises:
        FedbiomedError: if a modality can not be read
    """
    if self._validated is False:
        self.validate()

    entry = self._samples[index]
    try:
        loaded = {}
        for modality in self.modalities:
            loaded[modality] = NiftiReader.read(entry[modality])
    except Exception as exc:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Failed to retrieve item at index {index}"
        ) from exc

    # Demographics are copied over as stored in the sample
    if "demographics" in entry:
        loaded["demographics"] = entry["demographics"]
    return loaded
read_demographics staticmethod
read_demographics(tabular_file, index_col=None)
Read demographics tabular file
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
tabular_file | Union[str, PathLike, Path] | path to demographics file | required |
index_col | Optional[Union[int, str]] | Index column that matches `<subject>`. Defaults to None. | None |
Raises:
| Type | Description |
|---|---|
FedbiomedError | if the file can not be loaded |
Returns:
| Type | Description |
|---|---|
DataFrame | Demographics in DataFrame format |
Source code in fedbiomed/common/dataset_controller/_medical_folder_controller.py
@staticmethod
def read_demographics(
    tabular_file: Union[str, PathLike, Path],
    index_col: Optional[Union[int, str]] = None,
) -> pd.DataFrame:
    """Read demographics tabular file

    Args:
        tabular_file: path to demographics file
        index_col: Index column that matches <subject>. Either a column name
            or a zero-based column position. Defaults to None.

    Raises:
        FedbiomedError: if the file can not be loaded, or if an integer
            `index_col` is out of bounds

    Returns:
        Demographics in DataFrame format; when the index has duplicated
        values only the first row of each is kept
    """
    tabular_file = MedicalFolderController._validate_tabular_file(tabular_file)

    try:
        # pandas is used for flexibility in handling delimiters and index columns
        demographics = CsvReader(tabular_file).data.to_pandas()
        if index_col is not None:
            if isinstance(index_col, int):
                if index_col < 0 or index_col >= len(demographics.columns):
                    raise FedbiomedError(
                        f"{ErrorNumbers.FB613.value}: "
                        f"Index column {index_col} is out of bounds"
                    )
                # Translate the position into the corresponding column name
                index_col = demographics.columns[index_col]
            demographics = demographics.set_index(index_col)
    except FedbiomedError as e:
        # Message fixed: the prefix used to end with a stray colon (": :")
        raise FedbiomedError(
            f"{ErrorNumbers.FB613.value}: "
            f"Can not load demographics tabular file. Error message is: {e}"
        ) from e

    length = len(demographics)
    logger.info(f"Number of rows in demographics file: {length}")

    # Keep the first one in duplicated subjects
    demographics = demographics.loc[~demographics.index.duplicated(keep="first")]
    if length != len(demographics):
        logger.info(f"Length of demographics for unique index {len(demographics)}")

    return demographics
set_dlp
set_dlp(dlp)
Ensures validation of controller object after set_dlp is executed
Source code in fedbiomed/common/dataset_controller/_medical_folder_controller.py
def set_dlp(self, dlp):
    """Ensures validation of controller object after set_dlp is executed

    Args:
        dlp: Data loading plan forwarded to `DataLoadingPlanMixin.set_dlp`.
    """
    # Explicit call to the mixin implementation, then re-validate the controller
    DataLoadingPlanMixin.set_dlp(self, dlp)
    self.validate()
subject_modality_status
subject_modality_status(index=None)
Scans subjects and checks which modalities exist for each subject
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
index | Union[list, Series] | Array-like index that comes from reference csv file. It represents subject folder names. Defaults to None. | None |
Returns: Modality status that indicates which modalities are available per subject
Source code in fedbiomed/common/dataset_controller/_medical_folder_controller.py
def subject_modality_status(self, index: Union[list, pd.Series] = None) -> Dict:
    """Build a per-subject table telling which modalities are present.

    Args:
        index: Array-like index that comes from reference csv file.
            It represents subject folder names. Defaults to None.

    Returns:
        Dict with keys `"columns"`, `"data"` and `"index"` describing the
        status table: one row per subject, one boolean column per modality.
        When `index` is given, two extra boolean columns `in_folder` and
        `in_index` are added and subjects found only in `index` get a row too.
    """
    # Wide boolean table: rows are subjects, columns are modalities
    flagged = self.df_dir.assign(val=True)
    status = flagged.pivot_table(
        index="subject",
        columns="modality",
        values="val",
        fill_value=0,
    )
    status = status.astype(bool)

    if index is not None:
        status["in_folder"] = True
        from_index = pd.DataFrame(True, index=index, columns=["in_index"])
        # Outer join keeps subjects that are only in the folder or only in `index`
        status = pd.merge(
            status,
            from_index,
            left_index=True,
            right_index=True,
            how="outer",
        )
        # Cells left empty by the join mean "absent"
        status = status.fillna(False)

    columns = status.columns.tolist()
    rows = status.values.tolist()
    subjects = status.index.tolist()
    return {"columns": columns, "data": rows, "index": subjects}
validate
validate()
Source code in fedbiomed/common/dataset_controller/_medical_folder_controller.py
def validate(self) -> None:
    """Prepare the samples and check that a first sample can be read.

    Sets `_modalities`, `_subjects`, `_samples`, `_validated` and
    `_controller_kwargs`.
    """
    # Filter subjects to contain all modalities
    self._modalities, df_dir = self._prepare_df_dir_for_use(self.df_dir, self._dlp)
    # Generate list of samples: dict, with demographics and path to modalities
    self._subjects, self._samples = self._make_dataset(self.demographics, df_dir)
    # Change flag to be able to recover a sample
    # (`get_sample` calls `validate()` itself while the flag is False)
    # NOTE(review): the flag stays True if the check below raises — confirm intended
    self._validated = True
    # Check if is possible to use `reader` to recover a valid item
    _ = self.get_sample(0)

    # Paths are stored as plain strings
    self._controller_kwargs = {
        "root": str(self.root),
        "tabular_file": (
            None if self.tabular_file is None else str(self.tabular_file)
        ),
        "index_col": self.index_col,
        "dlp": self._dlp,
    }
MedicalFolderLoadingBlockTypes
MedicalFolderLoadingBlockTypes(*args)
Bases: DataLoadingBlockTypes, Enum
Source code in fedbiomed/common/constants.py
def __init__(self, *args):
    """Check this member: its value must be a `str` not used by another member.

    Raises:
        ValueError: if the value is not a `str`, or if a member of the class
            already has the same value
    """
    enum_cls = self.__class__
    if not isinstance(self.value, str):
        raise ValueError(
            "all fields of DataLoadingBlockTypes subclasses must be of str type"
        )
    for member in enum_cls:
        if self.value == member.value:
            new_name = self.name
            existing_name = enum_cls(self.value).name
            raise ValueError(
                f"duplicate values not allowed in DataLoadingBlockTypes and "
                f"its subclasses: {new_name} --> {existing_name}"
            )
Attributes
MODALITIES_TO_FOLDERS class-attribute instance-attribute
MODALITIES_TO_FOLDERS = 'modalities_to_folders'
MnistController
MnistController(root, train=True, download=True)
Bases: Controller
Generic Mnist controller where the data is arranged in this way:
root
└── MNIST
    └── raw
        ├── train-images-idx3-ubyte
        ├── train-labels-idx1-ubyte
        ├── t10k-images-idx3-ubyte
        └── t10k-labels-idx1-ubyte
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
root | Union[str, Path] | Root directory path | required |
train | bool | If true then train files are used | True |
download | bool | If true then downloads and extracts the files if they do not exist | True |
Raises:
| Type | Description |
|---|---|
FedbiomedError | if `torchvision.datasets.MNIST` can not be initialized |
Source code in fedbiomed/common/dataset_controller/_mnist_controller.py
def __init__(
    self,
    root: Union[str, Path],
    train: bool = True,
    download: bool = True,
) -> None:
    """Constructor of the class

    Args:
        root: Root directory path
        train: If true then train files are used
        download: If true then downloads and extracts the files if they do not exist

    Raises:
        FedbiomedError: if `torchvision.datasets.MNIST` can not be initialized
    """
    # Run the parent initialisers (Controller -> DataLoadingPlanMixin) so the
    # data loading plan state is set up, as the sibling controllers do
    super().__init__()
    self.root = root

    try:
        self._dataset = MNIST(root=self.root, train=train, download=download)
    except Exception as e:
        # The second literal needs the `f` prefix, otherwise "{e}" is emitted
        # verbatim instead of the underlying error
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: "
            f"Failed to instantiate MnistDataset object. {e}"
        ) from e

    # `download` is stored as False whatever value was passed in
    self._controller_kwargs = {
        "root": str(self.root),
        "train": train,
        "download": False,
    }
Attributes
root instance-attribute
root = root
Functions
get_sample
get_sample(index)
Retrieve a data sample without applying transforms
Source code in fedbiomed/common/dataset_controller/_mnist_controller.py
def get_sample(self, index: int) -> Dict[str, Any]:
    """Fetch the sample at `index` without applying transforms.

    Args:
        index: Position of the sample in the underlying dataset.

    Returns:
        Dict with the item under `"data"` and its label under `"target"`.

    Raises:
        FedbiomedError: if the item can not be retrieved
    """
    try:
        item, label = self._dataset[index]
    except Exception as exc:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Failed to retrieve item at index {index}"
        ) from exc
    else:
        return {"data": item, "target": label}
TabularController
TabularController(root)
Bases: Controller
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
root | Union[str, Path] | Root directory path | required |
Raises:
| Type | Description |
|---|---|
FedbiomedError | if `root` does not exist |
Source code in fedbiomed/common/dataset_controller/_tabular_controller.py
def __init__(
    self,
    root: Union[str, Path],
) -> None:
    """Constructor of the class

    Args:
        root: Root directory path

    Raises:
        FedbiomedError: if `root` does not exist
    """
    # Run the parent initialisers (Controller -> DataLoadingPlanMixin) so the
    # data loading plan state is set up, as the sibling controllers do
    super().__init__()
    self.root = root
    # Reader built on the root path; the other methods delegate to it
    self._reader = CsvReader(self.root)

    # Arguments stored in `_controller_kwargs`; root is kept as a plain string
    self._controller_kwargs = {
        "root": str(self.root),
    }
Attributes
root instance-attribute
root = root
Functions
get_sample
get_sample(index)
Retrieve a data sample without applying transforms
Source code in fedbiomed/common/dataset_controller/_tabular_controller.py
def get_sample(self, index: int) -> pl.DataFrame:  # type: ignore[override]
    """Fetch the row at `index` from the reader, without transforms.

    Args:
        index: Position of the requested row.

    Returns:
        Whatever `self._reader.get(index)` returns.

    Raises:
        FedbiomedError: if `index` is greater than or equal to the length
            of the controller
    """
    n_rows = self.__len__()
    if index >= n_rows:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Failed to retrieve item at index {index}"
        )
    row = self._reader.get(index)
    return row
get_types
get_types()
Get dtypes of the columns in the Tabular dataset
Source code in fedbiomed/common/dataset_controller/_tabular_controller.py
def get_types(self):
    """Report the dtype name of every column of the tabular dataset.

    Returns:
        Dict mapping each column of the reader's schema to the class name
        of its dtype.
    """
    schema = self._reader.data.schema
    dtype_names = {}
    for column, dtype in schema.items():
        dtype_names[column] = dtype.__class__.__name__
    return dtype_names
normalize_columns
normalize_columns(columns)
Validate and normalize columns to a list of column names
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
columns | Union[Iterable, int, str] | Columns to normalize | required |
Raises:
| Type | Description |
|---|---|
FedbiomedError | if `columns` is not valid |
Returns:
| Type | Description |
|---|---|
list[str] | List of column names |
Source code in fedbiomed/common/dataset_controller/_tabular_controller.py
def normalize_columns(self, columns: Union[Iterable, int, str]) -> list[str]:
    """Turn `columns` into a list of column names, delegating to the reader.

    Args:
        columns: Columns to normalize

    Raises:
        FedbiomedError: if `columns` is not valid

    Returns:
        List of column names
    """
    normalized = self._reader.normalize_columns(columns=columns)
    return normalized
shape
shape()
Source code in fedbiomed/common/dataset_controller/_tabular_controller.py
def shape(self) -> Dict:
    """Return the shape reported by the underlying reader."""
    reader_shape = self._reader.shape()
    return reader_shape